Merge tag 'perf-tools-for-v7.0-1-2026-02-21' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools

Pull perf tools updates from Arnaldo Carvalho de Melo: - Introduce 'perf sched stats' tool with record/report/diff workflows using schedstat counters - Add a faster libdw based addr2line implementation and allow selecting it or its alternatives via 'perf config addr2line.style=' - Data-type profiling fixes and improvements including the ability to select fields using 'perf report''s -F/-fields, e.g.: 'perf report --fields overhead,type' - Add 'perf test' regression tests for Data-type profiling with C and Rust workloads - Fix srcline printing with inlines in callchains, make sure this has coverage in 'perf test' - Fix printing of leaf IP in LBR callchains - Fix display of metrics without sufficient permission in 'perf stat' - Print all machines in 'perf kvm report -vvv', not just the host - Switch from SHA-1 to BLAKE2s for build ID generation, remove SHA-1 code - Fix 'perf report's histogram entry collapsing with '-F' option - Use system's cacheline size instead of a hardcoded value in 'perf report' - Allow filtering conversion by time range in 'perf data' - Cover conversion to CTF using 'perf data' in 'perf test' - Address newer glibc const-correctness (-Werror=discarded-qualifiers) issues - Fixes and improvements for ARM's CoreSight support, simplify ARM SPE event config in 'perf mem', update docs for 'perf c2c' including the ARM events it can be used with - Build support for generating metrics from arch specific python script, add extra AMD, Intel, ARM64 metrics using it - Add AMD Zen 6 events and metrics - Add JSON file with OpenHW Risc-V CVA6 hardware counters - Add 'perf kvm' stats live testing - Add more 'perf stat' tests to 'perf test' - Fix segfault in `perf lock contention -b/--use-bpf` - Fix various 'perf test' cases for s390 - Build system cleanups, bump minimum shellcheck version to 0.7.2 - Support building the capstone based annotation routines as a plugin - Allow passing extra Clang flags via EXTRA_BPF_FLAGS * tag 'perf-tools-for-v7.0-1-2026-02-21' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (255 commits) perf test script: Add python script testing support perf test script: Add perl script testing support perf script: Allow the generated script to be a path perf test: perf data --to-ctf testing perf test: Test pipe mode with data conversion --to-json perf json: Pipe mode --to-ctf support perf json: Pipe mode --to-json support perf check: Add libbabeltrace to the listed features perf build: Allow passing extra Clang flags via EXTRA_BPF_FLAGS perf test data_type_profiling.sh: Skip just the Rust tests if code_with_type workload is missing tools build: Fix feature test for rust compiler perf libunwind: Fix calls to thread__e_machine() perf stat: Add no-affinity flag perf evlist: Reduce affinity use and move into iterator, fix no affinity perf evlist: Missing TPEBS close in evlist__close() perf evlist: Special map propagation for tool events that read on 1 CPU perf stat-shadow: In prepare_metric fix guard on reading NULL perf_stat_evsel Revert "perf tool_pmu: More accurately set the cpus for tool events" tools build: Emit dependencies file for test-rust.bin tools build: Make test-rust.bin be removed by the 'clean' target ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-02-21 10:51:08 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-02-21 10:51:08 -0800
commit: c7decec2f2d2ab0366567f9e30c0e1418cece43f (patch)
tree: 50312739ad43d0655ea71c942d848db2ff123e8e /tools
parent: 3544d5ce36f403db6e5c994f526101c870ffe9fe (diff)
parent: dbf0108347bdb5d4ccef8910555b16c1f1a505f8 (diff)
324 files changed, 15622 insertions, 4400 deletions
diff --git a/tools/arch/arm64/include/uapi/asm/unistd.h b/tools/arch/arm64/include/uapi/asm/unistd.h
index df36f23876e8..9306726337fe 100644
--- a/tools/arch/arm64/include/uapi/asm/unistd.h
+++ b/tools/arch/arm64/include/uapi/asm/unistd.h
@@ -1,2 +1,24 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm/unistd_64.h>
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define __ARCH_WANT_RENAMEAT
+#define __ARCH_WANT_NEW_STAT
+#define __ARCH_WANT_SET_GET_RLIMIT
+#define __ARCH_WANT_TIME32_SYSCALLS
+#define __ARCH_WANT_MEMFD_SECRET
+
+#include <asm-generic/unistd.h>
diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build
index 3584ff308607..60e65870eae1 100644
--- a/tools/build/Makefile.build
+++ b/tools/build/Makefile.build
@@ -76,6 +76,14 @@ quiet_cmd_host_ld_multi = HOSTLD  $@
       cmd_host_ld_multi = $(if $(strip $(obj-y)),\
                           $(HOSTLD) -r -o $@  $(filter $(obj-y),$^),rm -f $@; $(HOSTAR) rcs $@)
 
+rust_common_cmd = \
+	$(RUSTC) $(rust_flags) \
+	--crate-type staticlib -L $(objtree)/rust/ \
+	--emit=dep-info=$(depfile),link
+
+quiet_cmd_rustc_a_rs = $(RUSTC) $(quiet_modtag) $@
+      cmd_rustc_a_rs = $(rust_common_cmd) -o $@ -g $< $(cmd_objtool)
+
 ifneq ($(filter $(obj),$(hostprogs)),)
   host = host_
 endif
@@ -105,6 +113,12 @@ $(OUTPUT)%.s: %.c FORCE
 	$(call rule_mkdir)
 	$(call if_changed_dep,cc_s_c)
 
+# it's recommended to build a static rust library, when a foreight (to rust)
+# linker is used.
+$(OUTPUT)%.a: %.rs FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,rustc_a_rs)
+
 # bison and flex files are generated in the OUTPUT directory
 # so it needs a separate rule to depend on them properly
 $(OUTPUT)%-bison.o: $(OUTPUT)%-bison.c FORCE
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 362cf8f4a0a0..0b7a7c38cb88 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -71,7 +71,7 @@ FEATURE_TESTS_BASIC :=                  \
         gettid				\
         glibc                           \
         libbfd                          \
-        libbfd-buildid			\
+	libbfd-threadsafe		\
         libelf                          \
         libelf-getphdrnum               \
         libelf-gelf_getnote             \
@@ -149,7 +149,8 @@ FEATURE_DISPLAY ?=              \
          bpf			\
          libaio			\
          libzstd		\
-         libopenssl
+         libopenssl		\
+         rust
 
 #
 # Declare group members of a feature to display the logical OR of the detection
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 0d5a15654b17..1fbcb3ce74d2 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -13,7 +13,7 @@ FILES=                                          \
          test-gtk2-infobar.bin                  \
          test-hello.bin                         \
          test-libbfd.bin                        \
-         test-libbfd-buildid.bin		\
+	 test-libbfd-threadsafe.bin      	\
          test-disassembler-four-args.bin        \
          test-disassembler-init-styled.bin	\
          test-reallocarray.bin			\
@@ -73,6 +73,7 @@ FILES=                                          \
          test-clang-bpf-co-re.bin		\
          test-file-handle.bin			\
          test-libpfm4.bin			\
+         test-rust.bin				\
          test-libopenssl.bin
 
 FILES := $(addprefix $(OUTPUT),$(FILES))
@@ -268,7 +269,7 @@ $(OUTPUT)test-libpython.bin:
 $(OUTPUT)test-libbfd.bin:
 	$(BUILD_BFD)
 
-$(OUTPUT)test-libbfd-buildid.bin:
+$(OUTPUT)test-libbfd-threadsafe.bin:
 	$(BUILD_BFD) || $(BUILD_BFD) -liberty || $(BUILD_BFD) -liberty -lz
 
 $(OUTPUT)test-disassembler-four-args.bin:
@@ -388,6 +389,15 @@ $(OUTPUT)test-libopenssl.bin:
 $(OUTPUT)test-bpftool-skeletons.bin:
 	$(SYSTEM_BPFTOOL) version | grep '^features:.*skeletons' \
 		> $(@:.bin=.make.output) 2>&1
+
+# Testing Rust is special: we don't compile anything, it's enough to check the
+# compiler presence. Compiling a test code for this purposes is problematic,
+# because Rust will emit a dependency file without any external references,
+# meaning that if rustc will be removed the build process will still think it's
+# there.
+$(OUTPUT)test-rust.bin:
+	$(RUSTC) --version > /dev/null 2>&1
+
 ###############################
 
 clean:
diff --git a/tools/build/feature/test-libbfd-buildid.c b/tools/build/feature/test-libbfd-buildid.c
deleted file mode 100644
index 157644b04c05..000000000000
--- a/tools/build/feature/test-libbfd-buildid.c
+++ /dev/null
@@ -1,8 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <bfd.h>
-
-int main(void)
-{
-	bfd *abfd = bfd_openr("Pedro", 0);
-	return abfd && (!abfd->build_id || abfd->build_id->size > 0x506564726f);
-}
diff --git a/tools/build/feature/test-libbfd-threadsafe.c b/tools/build/feature/test-libbfd-threadsafe.c
new file mode 100644
index 000000000000..fe97f95f6f06
--- /dev/null
+++ b/tools/build/feature/test-libbfd-threadsafe.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <bfd.h>
+
+static bool lock(void *unused)
+{
+	return true;
+}
+
+static bool unlock(void *unused)
+{
+	return true;
+}
+
+int main(void)
+{
+       /* Check for presence of new thread safety API (version 2.42) */
+       return !bfd_thread_init(lock, unlock, NULL);
+}
diff --git a/tools/include/linux/bitfield.h b/tools/include/linux/bitfield.h
index 6093fa6db260..ddf81f24956b 100644
--- a/tools/include/linux/bitfield.h
+++ b/tools/include/linux/bitfield.h
@@ -8,6 +8,7 @@
 #define _LINUX_BITFIELD_H
 
 #include <linux/build_bug.h>
+#include <linux/kernel.h>
 #include <asm/byteorder.h>
 
 /*
diff --git a/tools/include/linux/list.h b/tools/include/linux/list.h
index a4dfb6a7cc6a..a692ff7aed5c 100644
--- a/tools/include/linux/list.h
+++ b/tools/include/linux/list.h
@@ -170,6 +170,16 @@ static inline void list_move_tail(struct list_head *list,
 }
 
 /**
+ * list_is_first -- tests whether @list is the first entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_first(const struct list_head *list, const struct list_head *head)
+{
+	return list->prev == head;
+}
+
+/**
  * list_is_last - tests whether @list is the last entry in list @head
  * @list: the entry to test
  * @head: the head of the list
diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt
index 4072bc9b7670..576ecc5fc312 100644
--- a/tools/lib/perf/Documentation/libperf.txt
+++ b/tools/lib/perf/Documentation/libperf.txt
@@ -211,6 +211,8 @@ SYNOPSIS
   struct perf_record_header_feature;
   struct perf_record_compressed;
   struct perf_record_compressed2;
+  struct perf_record_schedstat_cpu;
+  struct perf_record_schedstat_domain;
 --
 
 DESCRIPTION
diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile
index 7fbb50b74c00..32301a1d8f0c 100644
--- a/tools/lib/perf/Makefile
+++ b/tools/lib/perf/Makefile
@@ -42,7 +42,6 @@ libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
 TEST_ARGS := $(if $(V),-v)
 
 INCLUDES = \
--I$(OUTPUT)arch/$(SRCARCH)/include/generated/uapi \
 -I$(srctree)/tools/lib/perf/include \
 -I$(srctree)/tools/lib/ \
 -I$(srctree)/tools/include \
@@ -51,9 +50,9 @@ INCLUDES = \
 -I$(srctree)/tools/include/uapi
 
 # Append required CFLAGS
+override CFLAGS := $(INCLUDES) $(CFLAGS)
 override CFLAGS += -g -Werror -Wall
 override CFLAGS += -fPIC
-override CFLAGS += $(INCLUDES)
 override CFLAGS += -fvisibility=hidden
 override CFLAGS += $(EXTRA_WARNINGS)
 override CFLAGS += $(EXTRA_CFLAGS)
@@ -100,16 +99,7 @@ $(LIBAPI)-clean:
 	$(call QUIET_CLEAN, libapi)
 	$(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null
 
-uapi-asm := $(OUTPUT)arch/$(SRCARCH)/include/generated/uapi/asm
-ifeq ($(SRCARCH),arm64)
-	syscall-y := $(uapi-asm)/unistd_64.h
-endif
-uapi-asm-generic:
-	$(if $(syscall-y),\
-		$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.asm-headers obj=$(uapi-asm) \
-		generic=include/uapi/asm-generic $(syscall-y),)
-
-$(LIBPERF_IN): uapi-asm-generic FORCE
+$(LIBPERF_IN): FORCE
 	$(Q)$(MAKE) $(build)=libperf
 
 $(LIBPERF_A): $(LIBPERF_IN)
@@ -130,7 +120,7 @@ all: fixdep
 clean: $(LIBAPI)-clean
 	$(call QUIET_CLEAN, libperf) $(RM) $(LIBPERF_A) \
                 *.o *~ *.a *.so *.so.$(VERSION) *.so.$(LIBPERF_VERSION) .*.d .*.cmd tests/*.o LIBPERF-CFLAGS $(LIBPERF_PC) \
-                $(TESTS_STATIC) $(TESTS_SHARED) $(syscall-y)
+                $(TESTS_STATIC) $(TESTS_SHARED)
 
 TESTS_IN = tests-in.o
 
@@ -179,6 +169,7 @@ install_lib: libs
 		cp -fpR $(LIBPERF_ALL) $(DESTDIR)$(libdir_SQ)
 
 HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h
+HDRS += schedstat-v15.h schedstat-v16.h schedstat-v17.h
 INTERNAL_HDRS := cpumap.h evlist.h evsel.h lib.h mmap.h rc_check.h threadmap.h xyarray.h
 
 INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/perf
diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
index 3ed023f4b190..1f210dadd666 100644
--- a/tools/lib/perf/evlist.c
+++ b/tools/lib/perf/evlist.c
@@ -101,6 +101,28 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist,
 		evsel->cpus = perf_cpu_map__get(evlist->user_requested_cpus);
 	}
 
+	/*
+	 * Tool events may only read on the first CPU index to avoid double
+	 * counting things like duration_time. Make the evsel->cpus contain just
+	 * that single entry otherwise we may spend time changing affinity to
+	 * CPUs that just have tool events, etc.
+	 */
+	if (evsel->reads_only_on_cpu_idx0 && perf_cpu_map__nr(evsel->cpus) > 0) {
+		struct perf_cpu_map *srcs[3] = {
+			evlist->all_cpus,
+			evlist->user_requested_cpus,
+			evsel->pmu_cpus,
+		};
+		for (size_t i = 0; i < ARRAY_SIZE(srcs); i++) {
+			if (!srcs[i])
+				continue;
+
+			perf_cpu_map__put(evsel->cpus);
+			evsel->cpus = perf_cpu_map__new_int(perf_cpu_map__cpu(srcs[i], 0).cpu);
+			break;
+		}
+	}
+
 	/* Sanity check assert before the evsel is potentially removed. */
 	assert(!evsel->requires_cpu || !perf_cpu_map__has_any_cpu(evsel->cpus));
 
@@ -133,16 +155,22 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist,
 
 static void perf_evlist__propagate_maps(struct perf_evlist *evlist)
 {
-	struct perf_evsel *evsel, *n;
-
 	evlist->needs_map_propagation = true;
 
 	/* Clear the all_cpus set which will be merged into during propagation. */
 	perf_cpu_map__put(evlist->all_cpus);
 	evlist->all_cpus = NULL;
 
-	list_for_each_entry_safe(evsel, n, &evlist->entries, node)
-		__perf_evlist__propagate_maps(evlist, evsel);
+	/* 2 rounds so that reads_only_on_cpu_idx0 benefit from knowing the other CPU maps. */
+	for (int round = 0; round < 2; round++) {
+		struct perf_evsel *evsel, *n;
+
+		list_for_each_entry_safe(evsel, n, &evlist->entries, node) {
+			if ((!evsel->reads_only_on_cpu_idx0 && round == 0) ||
+			    (evsel->reads_only_on_cpu_idx0 && round == 1))
+				__perf_evlist__propagate_maps(evlist, evsel);
+		}
+	}
 }
 
 void perf_evlist__add(struct perf_evlist *evlist,
diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h
index fefe64ba5e26..b988034f1371 100644
--- a/tools/lib/perf/include/internal/evsel.h
+++ b/tools/lib/perf/include/internal/evsel.h
@@ -128,6 +128,8 @@ struct perf_evsel {
 	bool			 requires_cpu;
 	/** Is the PMU for the event a core one? Effects the handling of own_cpus. */
 	bool			 is_pmu_core;
+	/** Does the evsel on read on the first CPU index such as tool time events? */
+	bool			 reads_only_on_cpu_idx0;
 	int			 idx;
 };
 
diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
index 43a8cb04994f..9043dc72b5d6 100644
--- a/tools/lib/perf/include/perf/event.h
+++ b/tools/lib/perf/include/perf/event.h
@@ -496,6 +496,71 @@ struct perf_record_bpf_metadata {
 	struct perf_record_bpf_metadata_entry entries[];
 };
 
+struct perf_record_schedstat_cpu_v15 {
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		_type _name
+#include "schedstat-v15.h"
+#undef CPU_FIELD
+};
+
+struct perf_record_schedstat_cpu_v16 {
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		_type _name
+#include "schedstat-v16.h"
+#undef CPU_FIELD
+};
+
+struct perf_record_schedstat_cpu_v17 {
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		_type _name
+#include "schedstat-v17.h"
+#undef CPU_FIELD
+};
+
+struct perf_record_schedstat_cpu {
+	struct perf_event_header header;
+	__u64			 timestamp;
+	__u32			 cpu;
+	__u16			 version;
+	/* Padding */
+	char			 __pad[2];
+	union {
+		struct perf_record_schedstat_cpu_v15 v15;
+		struct perf_record_schedstat_cpu_v16 v16;
+		struct perf_record_schedstat_cpu_v17 v17;
+	};
+};
+
+struct perf_record_schedstat_domain_v15 {
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		_type _name
+#include "schedstat-v15.h"
+#undef DOMAIN_FIELD
+};
+
+struct perf_record_schedstat_domain_v16 {
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		_type _name
+#include "schedstat-v16.h"
+#undef DOMAIN_FIELD
+};
+
+struct perf_record_schedstat_domain_v17 {
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		_type _name
+#include "schedstat-v17.h"
+#undef DOMAIN_FIELD
+};
+
+#define DOMAIN_NAME_LEN		16
+
+struct perf_record_schedstat_domain {
+	struct perf_event_header header;
+	__u64			 timestamp;
+	__u32			 cpu;
+	__u16			 version;
+	__u16			 domain;
+	union {
+		struct perf_record_schedstat_domain_v15 v15;
+		struct perf_record_schedstat_domain_v16 v16;
+		struct perf_record_schedstat_domain_v17 v17;
+	};
+};
+
 enum perf_user_event_type { /* above any possible kernel type */
 	PERF_RECORD_USER_TYPE_START		= 64,
 	PERF_RECORD_HEADER_ATTR			= 64,
@@ -519,6 +584,8 @@ enum perf_user_event_type { /* above any possible kernel type */
 	PERF_RECORD_FINISHED_INIT		= 82,
 	PERF_RECORD_COMPRESSED2			= 83,
 	PERF_RECORD_BPF_METADATA		= 84,
+	PERF_RECORD_SCHEDSTAT_CPU		= 85,
+	PERF_RECORD_SCHEDSTAT_DOMAIN		= 86,
 	PERF_RECORD_HEADER_MAX
 };
 
@@ -562,6 +629,8 @@ union perf_event {
 	struct perf_record_compressed		pack;
 	struct perf_record_compressed2		pack2;
 	struct perf_record_bpf_metadata		bpf_metadata;
+	struct perf_record_schedstat_cpu	schedstat_cpu;
+	struct perf_record_schedstat_domain	schedstat_domain;
 };
 
 #endif /* __LIBPERF_EVENT_H */
diff --git a/tools/lib/perf/include/perf/schedstat-v15.h b/tools/lib/perf/include/perf/schedstat-v15.h
new file mode 100644
index 000000000000..639458df05f8
--- /dev/null
+++ b/tools/lib/perf/include/perf/schedstat-v15.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CPU_FIELD
+CPU_FIELD(__u32, yld_count, "sched_yield() count",
+	  "%11u", false, yld_count, v15);
+CPU_FIELD(__u32, array_exp, "Legacy counter can be ignored",
+	  "%11u", false, array_exp, v15);
+CPU_FIELD(__u32, sched_count, "schedule() called",
+	  "%11u", false, sched_count, v15);
+CPU_FIELD(__u32, sched_goidle, "schedule() left the processor idle",
+	  "%11u", true, sched_count, v15);
+CPU_FIELD(__u32, ttwu_count, "try_to_wake_up() was called",
+	  "%11u", false, ttwu_count, v15);
+CPU_FIELD(__u32, ttwu_local, "try_to_wake_up() was called to wake up the local cpu",
+	  "%11u", true, ttwu_count, v15);
+CPU_FIELD(__u64, rq_cpu_time, "total runtime by tasks on this processor (in jiffies)",
+	  "%11llu", false, rq_cpu_time, v15);
+CPU_FIELD(__u64, run_delay, "total waittime by tasks on this processor (in jiffies)",
+	  "%11llu", true, rq_cpu_time, v15);
+CPU_FIELD(__u64, pcount, "total timeslices run on this cpu",
+	  "%11llu", false, pcount, v15);
+#endif
+
+#ifdef DOMAIN_FIELD
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category idle> ");
+#endif
+DOMAIN_FIELD(__u32, idle_lb_count,
+	     "load_balance() count on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_balanced,
+	     "load_balance() found balanced on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_failed,
+	     "load_balance() move task failed on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_imbalance,
+	     "imbalance sum on cpu idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, idle_lb_gained,
+	     "pull_task() count on cpu idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, idle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, idle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu idle", "%11u", true, v15);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(idle_lb_success_count, "load_balance() success count on cpu idle", "%11u",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, v15);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(idle_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu idle)", "%11.2Lf",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, idle_lb_gained, v15);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category busy> ");
+#endif
+DOMAIN_FIELD(__u32, busy_lb_count,
+	     "load_balance() count on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_balanced,
+	     "load_balance() found balanced on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_failed,
+	     "load_balance() move task failed on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_imbalance,
+	     "imbalance sum on cpu busy", "%11u", false, v15);
+DOMAIN_FIELD(__u32, busy_lb_gained,
+	     "pull_task() count on cpu busy", "%11u", false, v15);
+DOMAIN_FIELD(__u32, busy_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu busy", "%11u", false, v15);
+DOMAIN_FIELD(__u32, busy_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu busy", "%11u", true, v15);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(busy_lb_success_count, "load_balance() success count on cpu busy", "%11u",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, v15);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(busy_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu busy)", "%11.2Lf",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, busy_lb_gained, v15);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category newidle> ");
+#endif
+DOMAIN_FIELD(__u32, newidle_lb_count,
+	     "load_balance() count on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_balanced,
+	     "load_balance() found balanced on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_failed,
+	     "load_balance() move task failed on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance,
+	     "imbalance sum on cpu newly idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, newidle_lb_gained,
+	     "pull_task() count on cpu newly idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, newidle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu newly idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu newly idle", "%11u", true, v15);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(newidle_lb_success_count,
+		  "load_balance() success count on cpu newly idle", "%11u",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, v15);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(newidle_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu newly idle)", "%11.2Lf",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, newidle_lb_gained, v15);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category active_load_balance()> ");
+#endif
+DOMAIN_FIELD(__u32, alb_count,
+	     "active_load_balance() count", "%11u", false, v15);
+DOMAIN_FIELD(__u32, alb_failed,
+	     "active_load_balance() move task failed", "%11u", false, v15);
+DOMAIN_FIELD(__u32, alb_pushed,
+	     "active_load_balance() successfully moved a task", "%11u", false, v15);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_exec()> ");
+#endif
+DOMAIN_FIELD(__u32, sbe_count,
+	     "sbe_count is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbe_balanced,
+	     "sbe_balanced is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbe_pushed,
+	     "sbe_pushed is not used", "%11u", false, v15);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_fork()> ");
+#endif
+DOMAIN_FIELD(__u32, sbf_count,
+	     "sbf_count is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbf_balanced,
+	     "sbf_balanced is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbf_pushed,
+	     "sbf_pushed is not used", "%11u", false, v15);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Wakeup Info> ");
+#endif
+DOMAIN_FIELD(__u32, ttwu_wake_remote,
+	     "try_to_wake_up() awoke a task that last ran on a diff cpu", "%11u", false, v15);
+DOMAIN_FIELD(__u32, ttwu_move_affine,
+	     "try_to_wake_up() moved task because cache-cold on own cpu", "%11u", false, v15);
+DOMAIN_FIELD(__u32, ttwu_move_balance,
+	     "try_to_wake_up() started passive balancing", "%11u", false, v15);
+#endif /* DOMAIN_FIELD */
diff --git a/tools/lib/perf/include/perf/schedstat-v16.h b/tools/lib/perf/include/perf/schedstat-v16.h
new file mode 100644
index 000000000000..3462b79c29af
--- /dev/null
+++ b/tools/lib/perf/include/perf/schedstat-v16.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CPU_FIELD
+CPU_FIELD(__u32, yld_count, "sched_yield() count",
+	  "%11u", false, yld_count, v16);
+CPU_FIELD(__u32, array_exp, "Legacy counter can be ignored",
+	  "%11u", false, array_exp, v16);
+CPU_FIELD(__u32, sched_count, "schedule() called",
+	  "%11u", false, sched_count, v16);
+CPU_FIELD(__u32, sched_goidle, "schedule() left the processor idle",
+	  "%11u", true, sched_count, v16);
+CPU_FIELD(__u32, ttwu_count, "try_to_wake_up() was called",
+	  "%11u", false, ttwu_count, v16);
+CPU_FIELD(__u32, ttwu_local, "try_to_wake_up() was called to wake up the local cpu",
+	  "%11u", true, ttwu_count, v16);
+CPU_FIELD(__u64, rq_cpu_time, "total runtime by tasks on this processor (in jiffies)",
+	  "%11llu", false, rq_cpu_time, v16);
+CPU_FIELD(__u64, run_delay, "total waittime by tasks on this processor (in jiffies)",
+	  "%11llu", true, rq_cpu_time, v16);
+CPU_FIELD(__u64, pcount, "total timeslices run on this cpu",
+	  "%11llu", false, pcount, v16);
+#endif /* CPU_FIELD */
+
+#ifdef DOMAIN_FIELD
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category busy> ");
+#endif
+DOMAIN_FIELD(__u32, busy_lb_count,
+	     "load_balance() count on cpu busy", "%11u", true, v16);
+DOMAIN_FIELD(__u32, busy_lb_balanced,
+	     "load_balance() found balanced on cpu busy", "%11u", true, v16);
+DOMAIN_FIELD(__u32, busy_lb_failed,
+	     "load_balance() move task failed on cpu busy", "%11u", true, v16);
+DOMAIN_FIELD(__u32, busy_lb_imbalance,
+	     "imbalance sum on cpu busy", "%11u", false, v16);
+DOMAIN_FIELD(__u32, busy_lb_gained,
+	     "pull_task() count on cpu busy", "%11u", false, v16);
+DOMAIN_FIELD(__u32, busy_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu busy", "%11u", false, v16);
+DOMAIN_FIELD(__u32, busy_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu busy", "%11u", true, v16);
+DOMAIN_FIELD(__u32, busy_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu busy", "%11u", true, v16);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(busy_lb_success_count, "load_balance() success count on cpu busy", "%11u",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, v16);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(busy_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu busy)", "%11.2Lf",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, busy_lb_gained, v16);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category idle> ");
+#endif
+DOMAIN_FIELD(__u32, idle_lb_count,
+	     "load_balance() count on cpu idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, idle_lb_balanced,
+	     "load_balance() found balanced on cpu idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, idle_lb_failed,
+	     "load_balance() move task failed on cpu idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, idle_lb_imbalance,
+	     "imbalance sum on cpu idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, idle_lb_gained,
+	     "pull_task() count on cpu idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, idle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, idle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, idle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu idle", "%11u", true, v16);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(idle_lb_success_count, "load_balance() success count on cpu idle", "%11u",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, v16);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(idle_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu idle)", "%11.2Lf",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, idle_lb_gained, v16);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category newidle> ");
+#endif
+DOMAIN_FIELD(__u32, newidle_lb_count,
+	     "load_balance() count on cpu newly idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, newidle_lb_balanced,
+	     "load_balance() found balanced on cpu newly idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, newidle_lb_failed,
+	     "load_balance() move task failed on cpu newly idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance,
+	     "imbalance sum on cpu newly idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, newidle_lb_gained,
+	     "pull_task() count on cpu newly idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, newidle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu newly idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu newly idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu newly idle", "%11u", true, v16);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(newidle_lb_success_count,
+		  "load_balance() success count on cpu newly idle", "%11u",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, v16);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(newidle_lb_avg_count,
+		  "avg task pulled per successful lb attempt (cpu newly idle)", "%11.2Lf",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, newidle_lb_gained, v16);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category active_load_balance()> ");
+#endif
+DOMAIN_FIELD(__u32, alb_count,
+	     "active_load_balance() count", "%11u", false, v16);
+DOMAIN_FIELD(__u32, alb_failed,
+	     "active_load_balance() move task failed", "%11u", false, v16);
+DOMAIN_FIELD(__u32, alb_pushed,
+	     "active_load_balance() successfully moved a task", "%11u", false, v16);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_exec()> ");
+#endif
+DOMAIN_FIELD(__u32, sbe_count,
+	     "sbe_count is not used", "%11u", false, v16);
+DOMAIN_FIELD(__u32, sbe_balanced,
+	     "sbe_balanced is not used", "%11u", false, v16);
+DOMAIN_FIELD(__u32, sbe_pushed,
+	     "sbe_pushed is not used", "%11u", false, v16);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_fork()> ");
+#endif
+DOMAIN_FIELD(__u32, sbf_count,
+	     "sbf_count is not used", "%11u", false, v16);
+DOMAIN_FIELD(__u32, sbf_balanced,
+	     "sbf_balanced is not used", "%11u", false, v16);
+DOMAIN_FIELD(__u32, sbf_pushed,
+	     "sbf_pushed is not used", "%11u", false, v16);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Wakeup Info> ");
+#endif
+DOMAIN_FIELD(__u32, ttwu_wake_remote,
+	     "try_to_wake_up() awoke a task that last ran on a diff cpu", "%11u", false, v16);
+DOMAIN_FIELD(__u32, ttwu_move_affine,
+	     "try_to_wake_up() moved task because cache-cold on own cpu", "%11u", false, v16);
+DOMAIN_FIELD(__u32, ttwu_move_balance,
+	     "try_to_wake_up() started passive balancing", "%11u", false, v16);
+#endif /* DOMAIN_FIELD */
diff --git a/tools/lib/perf/include/perf/schedstat-v17.h b/tools/lib/perf/include/perf/schedstat-v17.h
new file mode 100644
index 000000000000..865dc7c1039c
--- /dev/null
+++ b/tools/lib/perf/include/perf/schedstat-v17.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CPU_FIELD
+CPU_FIELD(__u32, yld_count, "sched_yield() count",
+	  "%11u", false, yld_count, v17);
+CPU_FIELD(__u32, array_exp, "Legacy counter can be ignored",
+	  "%11u", false, array_exp, v17);
+CPU_FIELD(__u32, sched_count, "schedule() called",
+	  "%11u", false, sched_count, v17);
+CPU_FIELD(__u32, sched_goidle, "schedule() left the processor idle",
+	  "%11u", true, sched_count, v17);
+CPU_FIELD(__u32, ttwu_count, "try_to_wake_up() was called",
+	  "%11u", false, ttwu_count, v17);
+CPU_FIELD(__u32, ttwu_local, "try_to_wake_up() was called to wake up the local cpu",
+	  "%11u", true, ttwu_count, v17);
+CPU_FIELD(__u64, rq_cpu_time, "total runtime by tasks on this processor (in jiffies)",
+	  "%11llu", false, rq_cpu_time, v17);
+CPU_FIELD(__u64, run_delay, "total waittime by tasks on this processor (in jiffies)",
+	  "%11llu", true, rq_cpu_time, v17);
+CPU_FIELD(__u64, pcount, "total timeslices run on this cpu",
+	  "%11llu", false, pcount, v17);
+#endif /* CPU_FIELD */
+
+#ifdef DOMAIN_FIELD
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category busy> ");
+#endif
+DOMAIN_FIELD(__u32, busy_lb_count,
+	     "load_balance() count on cpu busy", "%11u", true, v17);
+DOMAIN_FIELD(__u32, busy_lb_balanced,
+	     "load_balance() found balanced on cpu busy", "%11u", true, v17);
+DOMAIN_FIELD(__u32, busy_lb_failed,
+	     "load_balance() move task failed on cpu busy", "%11u", true, v17);
+DOMAIN_FIELD(__u32, busy_lb_imbalance_load,
+	     "imbalance in load on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_imbalance_util,
+	     "imbalance in utilization on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_imbalance_task,
+	     "imbalance in number of tasks on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_imbalance_misfit,
+	     "imbalance in misfit tasks on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_gained,
+	     "pull_task() count on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu busy", "%11u", true, v17);
+DOMAIN_FIELD(__u32, busy_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu busy", "%11u", true, v17);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(busy_lb_success_count, "load_balance() success count on cpu busy", "%11u",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, v17);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(busy_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu busy)", "%11.2Lf",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, busy_lb_gained, v17);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category idle> ");
+#endif
+DOMAIN_FIELD(__u32, idle_lb_count,
+	     "load_balance() count on cpu idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, idle_lb_balanced,
+	     "load_balance() found balanced on cpu idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, idle_lb_failed,
+	     "load_balance() move task failed on cpu idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, idle_lb_imbalance_load,
+	     "imbalance in load on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_imbalance_util,
+	     "imbalance in utilization on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_imbalance_task,
+	     "imbalance in number of tasks on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_imbalance_misfit,
+	     "imbalance in misfit tasks on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_gained,
+	     "pull_task() count on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, idle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu idle", "%11u", true, v17);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(idle_lb_success_count, "load_balance() success count on cpu idle", "%11u",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, v17);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(idle_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu idle)", "%11.2Lf",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, idle_lb_gained, v17);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category newidle> ");
+#endif
+DOMAIN_FIELD(__u32, newidle_lb_count,
+	     "load_balance() count on cpu newly idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, newidle_lb_balanced,
+	     "load_balance() found balanced on cpu newly idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, newidle_lb_failed,
+	     "load_balance() move task failed on cpu newly idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance_load,
+	     "imbalance in load on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance_util,
+	     "imbalance in utilization on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance_task,
+	     "imbalance in number of tasks on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance_misfit,
+	     "imbalance in misfit tasks on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_gained,
+	     "pull_task() count on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu newly idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu newly idle", "%11u", true, v17);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(newidle_lb_success_count,
+		  "load_balance() success count on cpu newly idle", "%11u",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, v17);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(newidle_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu newly idle)", "%11.2Lf",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, newidle_lb_gained, v17);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category active_load_balance()> ");
+#endif
+DOMAIN_FIELD(__u32, alb_count,
+	     "active_load_balance() count", "%11u", false, v17);
+DOMAIN_FIELD(__u32, alb_failed,
+	     "active_load_balance() move task failed", "%11u", false, v17);
+DOMAIN_FIELD(__u32, alb_pushed,
+	     "active_load_balance() successfully moved a task", "%11u", false, v17);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_exec()> ");
+#endif
+DOMAIN_FIELD(__u32, sbe_count,
+	     "sbe_count is not used", "%11u", false, v17);
+DOMAIN_FIELD(__u32, sbe_balanced,
+	     "sbe_balanced is not used", "%11u", false, v17);
+DOMAIN_FIELD(__u32, sbe_pushed,
+	     "sbe_pushed is not used", "%11u", false, v17);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_fork()> ");
+#endif
+DOMAIN_FIELD(__u32, sbf_count,
+	     "sbf_count is not used", "%11u", false, v17);
+DOMAIN_FIELD(__u32, sbf_balanced,
+	     "sbf_balanced is not used", "%11u", false, v17);
+DOMAIN_FIELD(__u32, sbf_pushed,
+	     "sbf_pushed is not used", "%11u", false, v17);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Wakeup Info> ");
+#endif
+DOMAIN_FIELD(__u32, ttwu_wake_remote,
+	     "try_to_wake_up() awoke a task that last ran on a diff cpu", "%11u", false, v17);
+DOMAIN_FIELD(__u32, ttwu_move_affine,
+	     "try_to_wake_up() moved task because cache-cold on own cpu", "%11u", false, v17);
+DOMAIN_FIELD(__u32, ttwu_move_balance,
+	     "try_to_wake_up() started passive balancing", "%11u", false, v17);
+#endif /* DOMAIN_FIELD */
diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c
index ddaeb4eb3e24..db94aa685b73 100644
--- a/tools/lib/subcmd/help.c
+++ b/tools/lib/subcmd/help.c
@@ -97,11 +97,13 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
 			ei++;
 		}
 	}
-	if (ci != cj) {
-		while (ci < cmds->cnt) {
-			cmds->names[cj++] = cmds->names[ci];
-			cmds->names[ci++] = NULL;
+	while (ci < cmds->cnt) {
+		if (ci != cj) {
+			cmds->names[cj] = cmds->names[ci];
+			cmds->names[ci] = NULL;
 		}
+		ci++;
+		cj++;
 	}
 	for (ci = cj; ci < cmds->cnt; ci++)
 		assert(cmds->names[ci] == NULL);
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index b64302a76144..0f9451a6e39c 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -36,12 +36,18 @@ config.mak.autogen
 util/intel-pt-decoder/inat-tables.c
 arch/*/include/generated/
 trace/beauty/generated/
+pmu-events/arch/common/common/legacy-cache.json
 pmu-events/pmu-events.c
 pmu-events/jevents
 pmu-events/metric_test.log
 pmu-events/empty-pmu-events.log
 pmu-events/test-empty-pmu-events.c
 *.shellcheck_log
+pmu-events/arch/**/extra-metrics.json
+pmu-events/arch/**/extra-metricgroups.json
+tests/shell/*.shellcheck_log
+tests/shell/coresight/*.shellcheck_log
+tests/shell/lib/*.shellcheck_log
 feature/
 libapi/
 libbpf/
diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt
index 40b0f71a2c44..e57a122b8719 100644
--- a/tools/perf/Documentation/perf-c2c.txt
+++ b/tools/perf/Documentation/perf-c2c.txt
@@ -160,20 +160,43 @@ Following perf record options are configured by default:
 
   -W,-d,--phys-data,--sample-cpu
 
-Unless specified otherwise with '-e' option, following events are monitored by
-default on Intel:
-
-  cpu/mem-loads,ldlat=30/P
-  cpu/mem-stores/P
-
-following on AMD:
-
-  ibs_op//
-
-and following on PowerPC:
-
-  cpu/mem-loads/
-  cpu/mem-stores/
+The following table lists the events monitored on different architectures.
+Unless specified otherwise with the -e option, the tool will select the
+default events.
+
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | Arch   | Configuration | Options         | Events                                                                         |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | Intel  | Default       | -e ldlat-loads  | cpu/mem-loads,ldlat=30/P                                                       |
+  |        |               | -e ldlat-stores | cpu/mem-stores/P                                                               |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Load only     | -e ldlat-loads  | cpu/mem-loads,ldlat=30/P                                                       |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Store only    | -e ldlat-stores | cpu/mem-stores/P                                                               |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | Intel  | Default       | -e ldlat-loads  | {cpu/mem-loads-aux/,cpu/mem-loads,ldlat=30/}:P                                 |
+  | with   |               | -e ldlat-stores | cpu/mem-stores/P                                                               |
+  | AUX    |--------------+------------------+--------------------------------------------------------------------------------+
+  |        | Load only     | -e ldlat-loads  | {cpu/mem-loads-aux/,cpu/mem-loads,ldlat=30/}:P                                 |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Store only    | -e ldlat-stores | cpu/mem-stores/P                                                               |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | AMD    | Default       | -e mem-ldst     | ibs_op// (without latency support)                                             |
+  |        |               |                 | ibs_op/ldlat=30/ (with latency support)                                        |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | PowerPC| Default       | -e ldlat-loads  | cpu/mem-loads/                                                                 |
+  |        |               | -e ldlat-stores | cpu/mem-stores/                                                                |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Load only     | -e ldlat-loads  | cpu/mem-loads/                                                                 |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Store only    | -e ldlat-stores | cpu/mem-stores/                                                                |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | Arm    | Default       | -e spe-ldst     | arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=30/ |
+  | SPE    |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Load only     | -e spe-load     | arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,min_latency=30/                |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Store only    | -e spe-store    | arm_spe_0/ts_enable=1,pa_enable=1,store_filter=1/                              |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
 
 User can pass any 'perf record' option behind '--' mark, like (to enable
 callchains and system wide monitoring):
diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt
index 417bf17e265c..20f178d61ed7 100644
--- a/tools/perf/Documentation/perf-data.txt
+++ b/tools/perf/Documentation/perf-data.txt
@@ -40,6 +40,34 @@ OPTIONS for 'convert'
 --force::
 	Don't complain, do it.
 
+--time::
+	Only convert samples within given time window: <start>,<stop>. Times
+	have the format seconds.nanoseconds. If start is not given (i.e. time
+	string is ',x.y') then analysis starts at the beginning of the file. If
+	stop time is not given (i.e. time string is 'x.y,') then analysis goes
+	to end of file. Multiple ranges can be separated by spaces, which
+	requires the argument to be quoted e.g. --time "1234.567,1234.789 1235,"
+
+	Also support time percent with multiple time ranges. Time string is
+	'a%/n,b%/m,...' or 'a%-b%,c%-%d,...'.
+
+	For example:
+	Select the second 10% time slice:
+
+	  perf data convert --to-json out.json --time 10%/2
+
+	Select from 0% to 10% time slice:
+
+	  perf data convert --to-json out.json --time 0%-10%
+
+	Select the first and second 10% time slices:
+
+	  perf data convert --to-json out.json --time 10%/1,10%/2
+
+	Select from 0% to 10% and 30% to 40% slices:
+
+	  perf data convert --to-json out.json --time 0%-10%,30%-40%
+
 -v::
 --verbose::
         Be more verbose (show counter open errors, etc).
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index c972032f4ca0..95dfdf39666e 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -109,6 +109,11 @@ include::itrace.txt[]
 	should be used, and also --buildid-all and --switch-events may be
 	useful.
 
+--convert-callchain::
+	Parse DWARF callchains and convert them to usual callchains.  This also
+	discards stack and register data from the samples.  This will lose
+	inlined callchain entries.
+
 :GMEXAMPLECMD: inject
 :GMEXAMPLESUBCMD:
 include::guestmount.txt[]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index e8b9aadbbfa5..178f483140ed 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -344,7 +344,8 @@ OPTIONS
 
 -d::
 --data::
-	Record the sample virtual addresses.  Implies --sample-mem-info.
+	Record the sample virtual addresses.  Implies --sample-mem-info and
+	--data-mmap.
 
 --phys-data::
 	Record the sample physical addresses.
@@ -454,7 +455,7 @@ following filters are defined:
 	- no_tx: only when the target is not in a hardware transaction
 	- abort_tx: only when the target is a hardware transaction abort
 	- cond: conditional branches
-	- call_stack: save call stack
+	- stack: save call stack
 	- no_flags: don't save branch flags e.g prediction, misprediction etc
 	- no_cycles: don't save branch cycles
 	- hw_index: save branch hardware index
@@ -861,6 +862,11 @@ filtered through the mask provided by -C option.
 	Prepare BPF filter to be used by regular users.  The action should be
 	either "pin" or "unpin".  The filter can be used after it's pinned.
 
+--data-mmap::
+	Enable recording MMAP events for non-executable mappings.  Basically
+	perf only records executable mappings but data mmaping can be useful
+	when you analyze data access with sample addresses.  So using -d option
+	would enable this unless you specify --no-data-mmap manually.
 
 include::intel-hybrid.txt[]
 
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 6dbbddb6464d..4d9981609c04 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -8,7 +8,7 @@ perf-sched - Tool to trace/measure scheduler properties (latencies)
 SYNOPSIS
 --------
 [verse]
-'perf sched' {record|latency|map|replay|script|timehist}
+'perf sched' {record|latency|map|replay|script|timehist|stats}
 
 DESCRIPTION
 -----------
@@ -80,8 +80,267 @@ There are several variants of 'perf sched':
     
    Times are in msec.usec.
 
+   'perf sched stats {record | report | diff} <command>' to capture, report the diff
+   in schedstat counters and show the difference between perf sched stats report
+   respectively. schedstat counters which are present in the linux kernel and are
+   exposed through the file ``/proc/schedstat``. These counters are enabled or disabled
+   via the sysctl governed by the file ``/proc/sys/kernel/sched_schedstats``. These
+   counters accounts for many scheduler events such as ``schedule()`` calls, load-balancing
+   events, ``try_to_wakeup()`` call among others. This is useful in understanding the
+   scheduler behavior for the workload.
+
+   Note: The tool will not give correct results if there is topological reordering or
+         online/offline of cpus in between capturing snapshots of `/proc/schedstat`.
+
+    Example usage:
+        perf sched stats record -- sleep 1
+        perf sched stats report
+        perf sched stats diff
+
+   A detailed description of the schedstats can be found in the Kernel Documentation:
+   https://www.kernel.org/doc/html/latest/scheduler/sched-stats.html
+
+   The result can be interpreted as follows:
+
+   The `perf sched stats report` starts with description of the columns present in
+   the report. These column names are given before cpu and domain stats to improve
+   the readability of the report.
+
+   ----------------------------------------------------------------------------------------------------
+   DESC                    -> Description of the field
+   COUNT                   -> Value of the field
+   PCT_CHANGE              -> Percent change with corresponding base value
+   AVG_JIFFIES             -> Avg time in jiffies between two consecutive occurrence of event
+   ----------------------------------------------------------------------------------------------------
+
+   Next is the total profiling time in terms of jiffies:
+
+   ----------------------------------------------------------------------------------------------------
+   Time elapsed (in jiffies)                                   :        2323
+   ----------------------------------------------------------------------------------------------------
+
+   Next is CPU scheduling statistics. These are simple diffs of /proc/schedstat CPU lines
+   along with description. The report also prints % relative to base stat.
+
+   In the example below, schedule() left the CPU0 idle 36.58% of the time. 0.45% of total
+   try_to_wake_up() was to wakeup local CPU. And, the total waittime by tasks on CPU0 is
+   48.70% of the total runtime by tasks on the same CPU.
+
+   ----------------------------------------------------------------------------------------------------
+   CPU 0
+   ----------------------------------------------------------------------------------------------------
+   DESC                                                                     COUNT   PCT_CHANGE
+   ----------------------------------------------------------------------------------------------------
+   yld_count                                                        :           0
+   array_exp                                                        :           0
+   sched_count                                                      :      402267
+   sched_goidle                                                     :      147161  (    36.58% )
+   ttwu_count                                                       :      236309
+   ttwu_local                                                       :        1062  (     0.45% )
+   rq_cpu_time                                                      :  7083791148
+   run_delay                                                        :  3449973971  (    48.70% )
+   pcount                                                           :      255035
+   ----------------------------------------------------------------------------------------------------
+
+   Next is load balancing statistics. For each of the sched domains
+   (eg: `SMT`, `MC`, `DIE`...), the scheduler computes statistics under
+   the following three categories:
+
+   1) Idle Load Balance: Load balancing performed on behalf of a long
+                         idling CPU by some other CPU.
+   2) Busy Load Balance: Load balancing performed when the CPU was busy.
+   3) New Idle Balance : Load balancing performed when a CPU just became
+                        idle.
+
+   Under each of these three categories, sched stats report provides
+   different load balancing statistics. Along with direct stats, the
+   report also contains derived metrics prefixed with *. Example:
+
+   ----------------------------------------------------------------------------------------------------
+   CPU 0, DOMAIN SMT CPUS 0,64
+   ----------------------------------------------------------------------------------------------------
+   DESC                                                                     COUNT    AVG_JIFFIES
+   ----------------------------------------- <Category busy> ------------------------------------------
+   busy_lb_count                                                    :         136  $       17.08 $
+   busy_lb_balanced                                                 :         131  $       17.73 $
+   busy_lb_failed                                                   :           0  $        0.00 $
+   busy_lb_imbalance_load                                           :          58
+   busy_lb_imbalance_util                                           :           0
+   busy_lb_imbalance_task                                           :           0
+   busy_lb_imbalance_misfit                                         :           0
+   busy_lb_gained                                                   :           7
+   busy_lb_hot_gained                                               :           0
+   busy_lb_nobusyq                                                  :           2  $     1161.50 $
+   busy_lb_nobusyg                                                  :         129  $       18.01 $
+   *busy_lb_success_count                                           :           5
+   *busy_lb_avg_pulled                                              :        1.40
+   ----------------------------------------- <Category idle> ------------------------------------------
+   idle_lb_count                                                    :         449  $        5.17 $
+   idle_lb_balanced                                                 :         382  $        6.08 $
+   idle_lb_failed                                                   :           3  $      774.33 $
+   idle_lb_imbalance_load                                           :           0
+   idle_lb_imbalance_util                                           :           0
+   idle_lb_imbalance_task                                           :          71
+   idle_lb_imbalance_misfit                                         :           0
+   idle_lb_gained                                                   :          67
+   idle_lb_hot_gained                                               :           0
+   idle_lb_nobusyq                                                  :           0  $        0.00 $
+   idle_lb_nobusyg                                                  :         382  $        6.08 $
+   *idle_lb_success_count                                           :          64
+   *idle_lb_avg_pulled                                              :        1.05
+   ---------------------------------------- <Category newidle> ----------------------------------------
+   newidle_lb_count                                                 :       30471  $        0.08 $
+   newidle_lb_balanced                                              :       28490  $        0.08 $
+   newidle_lb_failed                                                :         633  $        3.67 $
+   newidle_lb_imbalance_load                                        :           0
+   newidle_lb_imbalance_util                                        :           0
+   newidle_lb_imbalance_task                                        :        2040
+   newidle_lb_imbalance_misfit                                      :           0
+   newidle_lb_gained                                                :        1348
+   newidle_lb_hot_gained                                            :           0
+   newidle_lb_nobusyq                                               :           6  $      387.17 $
+   newidle_lb_nobusyg                                               :       26634  $        0.09 $
+   *newidle_lb_success_count                                        :        1348
+   *newidle_lb_avg_pulled                                           :        1.00
+   ----------------------------------------------------------------------------------------------------
+
+   Consider following line:
+
+   newidle_lb_balanced                                              :       28490  $        0.08 $
+
+   While profiling was active, the load-balancer found 28490 times the load
+   needs to be balanced on a newly idle CPU 0. Following value encapsulated
+   inside $ is average jiffies between two events (2323 / 28490 = 0.08).
+
+   Next are active_load_balance() stats. alb did not trigger while the
+   profiling was active, hence it's all 0s.
+
+   --------------------------------- <Category active_load_balance()> ---------------------------------
+   alb_count                                                        :           0
+   alb_failed                                                       :           0
+   alb_pushed                                                       :           0
+   ----------------------------------------------------------------------------------------------------
+
+   Next are sched_balance_exec() and sched_balance_fork() stats. They are
+   not used but we kept it in RFC just for legacy purpose. Unless opposed,
+   we plan to remove them in next revision.
+
+   Next are wakeup statistics. For every domain, the report also shows
+   task-wakeup statistics. Example:
+
+   ------------------------------------------ <Wakeup Info> -------------------------------------------
+   ttwu_wake_remote                                                 :        1590
+   ttwu_move_affine                                                 :          84
+   ttwu_move_balance                                                :           0
+   ----------------------------------------------------------------------------------------------------
+
+   Same set of stats are reported for each CPU and each domain level.
+
+   How to interpret the diff
+   ~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   The `perf sched stats diff` will also start with explaining the columns
+   present in the diff. Then it will show the diff in time in terms of
+   jiffies. The order of the values depends on the order of input data
+   files. It will take `perf.data.old` and `perf.data` respectively as the
+   defaults for comparison. Example:
+
+   ----------------------------------------------------------------------------------------------------
+   Time elapsed (in jiffies)                                        :        2009,       2001
+   ----------------------------------------------------------------------------------------------------
+
+   Below is the sample representing the difference in cpu and domain stats of
+   two runs. Here third column or the values enclosed in `|...|` shows the
+   percent change between the two. Second and fourth columns shows the
+   side-by-side representions of the corresponding fields from `perf sched
+   stats report`.
+
+   ----------------------------------------------------------------------------------------------------
+   CPU <ALL CPUS SUMMARY>
+   ----------------------------------------------------------------------------------------------------
+   DESC                                                                    COUNT1      COUNT2   PCT_CHANG>
+   ----------------------------------------------------------------------------------------------------
+   yld_count                                                        :           0,          0  |     0.00>
+   array_exp                                                        :           0,          0  |     0.00>
+   sched_count                                                      :      528533,     412573  |   -21.94>
+   sched_goidle                                                     :      193426,     146082  |   -24.48>
+   ttwu_count                                                       :      313134,     385975  |    23.26>
+   ttwu_local                                                       :        1126,       1282  |    13.85>
+   rq_cpu_time                                                      :  8257200244, 8301250047  |     0.53>
+   run_delay                                                        :  4728347053, 3997100703  |   -15.47>
+   pcount                                                           :      335031,     266396  |   -20.49>
+   ----------------------------------------------------------------------------------------------------
+
+   Below is the sample of domain stats diff:
+
+   ----------------------------------------------------------------------------------------------------
+   CPU <ALL CPUS SUMMARY>, DOMAIN SMT
+   ----------------------------------------------------------------------------------------------------
+   DESC                                                                    COUNT1      COUNT2   PCT_CHANG>
+   ----------------------------------------- <Category busy> ------------------------------------------
+   busy_lb_count                                                    :         122,         80  |   -34.43>
+   busy_lb_balanced                                                 :         115,         76  |   -33.91>
+   busy_lb_failed                                                   :           1,          3  |   200.00>
+   busy_lb_imbalance_load                                           :          35,         49  |    40.00>
+   busy_lb_imbalance_util                                           :           0,          0  |     0.00>
+   busy_lb_imbalance_task                                           :           0,          0  |     0.00>
+   busy_lb_imbalance_misfit                                         :           0,          0  |     0.00>
+   busy_lb_gained                                                   :           7,          2  |   -71.43>
+   busy_lb_hot_gained                                               :           0,          0  |     0.00>
+   busy_lb_nobusyq                                                  :           0,          0  |     0.00>
+   busy_lb_nobusyg                                                  :         115,         76  |   -33.91>
+   *busy_lb_success_count                                           :           6,          1  |   -83.33>
+   *busy_lb_avg_pulled                                              :        1.17,       2.00  |    71.43>
+   ----------------------------------------- <Category idle> ------------------------------------------
+   idle_lb_count                                                    :         568,        620  |     9.15>
+   idle_lb_balanced                                                 :         462,        449  |    -2.81>
+   idle_lb_failed                                                   :          11,         21  |    90.91>
+   idle_lb_imbalance_load                                           :           0,          0  |     0.00>
+   idle_lb_imbalance_util                                           :           0,          0  |     0.00>
+   idle_lb_imbalance_task                                           :         115,        189  |    64.35>
+   idle_lb_imbalance_misfit                                         :           0,          0  |     0.00>
+   idle_lb_gained                                                   :         103,        169  |    64.08>
+   idle_lb_hot_gained                                               :           0,          0  |     0.00>
+   idle_lb_nobusyq                                                  :           0,          0  |     0.00>
+   idle_lb_nobusyg                                                  :         462,        449  |    -2.81>
+   *idle_lb_success_count                                           :          95,        150  |    57.89>
+   *idle_lb_avg_pulled                                              :        1.08,       1.13  |     3.92>
+   ---------------------------------------- <Category newidle> ----------------------------------------
+   newidle_lb_count                                                 :       16961,       3155  |   -81.40>
+   newidle_lb_balanced                                              :       15646,       2556  |   -83.66>
+   newidle_lb_failed                                                :         397,        142  |   -64.23>
+   newidle_lb_imbalance_load                                        :           0,          0  |     0.00>
+   newidle_lb_imbalance_util                                        :           0,          0  |     0.00>
+   newidle_lb_imbalance_task                                        :        1376,        655  |   -52.40>
+   newidle_lb_imbalance_misfit                                      :           0,          0  |     0.00>
+   newidle_lb_gained                                                :         917,        457  |   -50.16>
+   newidle_lb_hot_gained                                            :           0,          0  |     0.00>
+   newidle_lb_nobusyq                                               :           3,          1  |   -66.67>
+   newidle_lb_nobusyg                                               :       14480,       2103  |   -85.48>
+   *newidle_lb_success_count                                        :         918,        457  |   -50.22>
+   *newidle_lb_avg_pulled                                           :        1.00,       1.00  |     0.11>
+   --------------------------------- <Category active_load_balance()> ---------------------------------
+   alb_count                                                        :           0,          1  |     0.00>
+   alb_failed                                                       :           0,          0  |     0.00>
+   alb_pushed                                                       :           0,          1  |     0.00>
+   --------------------------------- <Category sched_balance_exec()> ----------------------------------
+   sbe_count                                                        :           0,          0  |     0.00>
+   sbe_balanced                                                     :           0,          0  |     0.00>
+   sbe_pushed                                                       :           0,          0  |     0.00>
+   --------------------------------- <Category sched_balance_fork()> ----------------------------------
+   sbf_count                                                        :           0,          0  |     0.00>
+   sbf_balanced                                                     :           0,          0  |     0.00>
+   sbf_pushed                                                       :           0,          0  |     0.00>
+   ------------------------------------------ <Wakeup Info> -------------------------------------------
+   ttwu_wake_remote                                                 :        2031,       2914  |    43.48>
+   ttwu_move_affine                                                 :          73,        124  |    69.86>
+   ttwu_move_balance                                                :           0,          0  |     0.00>
+   ----------------------------------------------------------------------------------------------------
+
 OPTIONS
 -------
+Applicable to {record|latency|map|replay|script}
+
 -i::
 --input=<file>::
         Input file name. (default: perf.data unless stdin is a fifo)
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 03d112960632..ddf92f9c7821 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -98,8 +98,10 @@ OPTIONS
 
 -g::
 --gen-script=::
-        Generate perf-script.[ext] starter script for given language,
-        using current perf.data.
+	Generate a starter script. If a language is given then the
+        script is named perf-script.[ext] according to the
+        language. If a file path is given then python is used for
+        files ending '.py' and perl used for files ending '.pl'.
 
 --dlfilter=<file>::
 	Filter sample events using the given shared object file.
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 1a766d4a2233..7cccc3a847d1 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -382,6 +382,11 @@ color the metric's computed value.
 Don't print output, warnings or messages. This is useful with perf stat
 record below to only write data to the perf.data file.
 
+--no-affinity::
+Don't change scheduler CPU affinities when iterating over
+CPUs. Disables an optimization aimed at minimizing interprocessor
+interrupts.
+
 STAT RECORD
 -----------
 Stores stat data into perf data file.
diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt
index c9d4dec65344..0e4d0ecc9e12 100644
--- a/tools/perf/Documentation/perf.data-file-format.txt
+++ b/tools/perf/Documentation/perf.data-file-format.txt
@@ -447,6 +447,23 @@ struct {
 	} [nr_pmu];
 };
 
+	HEADER_CPU_DOMAIN_INFO = 32,
+
+List of cpu-domain relation info. The format of the data is as below.
+
+struct domain_info {
+	int domain;
+	char dname[];
+	char cpumask[];
+	char cpulist[];
+};
+
+struct cpu_domain_info {
+	int cpu;
+	int nr_domains;
+	struct domain_info domains[];
+};
+
 	other bits are reserved and should ignored for now
 	HEADER_FEAT_BITS	= 256,
 
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index d8d25f62aaad..a8dc72cfe48e 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -64,7 +64,6 @@ include $(srctree)/tools/scripts/Makefile.arch
 $(call detected_var,SRCARCH)
 
 CFLAGS += -I$(OUTPUT)arch/$(SRCARCH)/include/generated
-CFLAGS += -I$(OUTPUT)libperf/arch/$(SRCARCH)/include/generated/uapi
 
 # Additional ARCH settings for ppc
 ifeq ($(SRCARCH),powerpc)
@@ -118,14 +117,6 @@ ifeq ($(ARCH),mips)
   endif
 endif
 
-# So far there's only x86 and arm libdw unwind support merged in perf.
-# Disable it on all other architectures in case libdw unwind
-# support is detected in system. Add supported architectures
-# to the check.
-ifneq ($(SRCARCH),$(filter $(SRCARCH),x86 arm arm64 powerpc s390 csky riscv loongarch))
-  NO_LIBDW_DWARF_UNWIND := 1
-endif
-
 ifneq ($(LIBUNWIND),1)
   NO_LIBUNWIND := 1
 endif
@@ -379,8 +370,8 @@ ifneq ($(TCMALLOC),)
 endif
 
 ifeq ($(FEATURES_DUMP),)
-# We will display at the end of this Makefile.config, using $(call feature_display_entries)
-# As we may retry some feature detection here, see the disassembler-four-args case, for instance
+# We will display at the end of this Makefile.config, using $(call feature_display_entries),
+# as we may retry some feature detection here.
   FEATURE_DISPLAY_DEFERRED := 1
 include $(srctree)/tools/build/Makefile.feature
 else
@@ -456,7 +447,6 @@ endif
 ifdef NO_LIBELF
   NO_LIBDW := 1
   NO_LIBUNWIND := 1
-  NO_LIBDW_DWARF_UNWIND := 1
   NO_LIBBPF := 1
   NO_JVMTI := 1
 else
@@ -504,10 +494,6 @@ ifeq ($(feature-libaio), 1)
   endif
 endif
 
-ifdef NO_LIBDW
-  NO_LIBDW_DWARF_UNWIND := 1
-endif
-
 ifeq ($(feature-scandirat), 1)
   # Ignore having scandirat with memory sanitizer that lacks an interceptor.
   ifeq ($(filter s% -fsanitize=memory%,$(EXTRA_CFLAGS),),)
@@ -757,7 +743,7 @@ dwarf-post-unwind-text := BUG
 
 # setup DWARF post unwinder
 ifdef NO_LIBUNWIND
-  ifdef NO_LIBDW_DWARF_UNWIND
+  ifdef NO_LIBDW
     $(warning Disabling post unwind, no support found.)
     dwarf-post-unwind := 0
   else
@@ -767,10 +753,6 @@ ifdef NO_LIBUNWIND
 else
   dwarf-post-unwind-text := libunwind
   $(call detected,CONFIG_LIBUNWIND)
-  # Enable libunwind support by default.
-  ifndef NO_LIBDW_DWARF_UNWIND
-    NO_LIBDW_DWARF_UNWIND := 1
-  endif
 endif
 
 ifeq ($(dwarf-post-unwind),1)
@@ -931,48 +913,32 @@ ifneq ($(NO_JEVENTS),1)
 endif
 
 ifdef BUILD_NONDISTRO
+  # call all detections now so we get correct status in VF output
   $(call feature_check,libbfd)
+  $(call feature_check,disassembler-four-args)
+  $(call feature_check,disassembler-init-styled)
+  $(call feature_check,libbfd-threadsafe)
+  $(call feature_check,libbfd-liberty)
+  $(call feature_check,libbfd-liberty-z)
+
+  ifneq ($(feature-libbfd-threadsafe), 1)
+    $(error binutils 2.42 or later is required for non-distro builds)
+  endif
 
+  # we may be on a system that requires -liberty and (maybe) -lz
+  # to link against -lbfd; test each case individually here
   ifeq ($(feature-libbfd), 1)
     EXTLIBS += -lbfd -lopcodes
-    FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl
-    FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl
-  else
-    # we are on a system that requires -liberty and (maybe) -lz
-    # to link against -lbfd; test each case individually here
-
-    # call all detections now so we get correct
-    # status in VF output
-    $(call feature_check,libbfd-liberty)
-    $(call feature_check,libbfd-liberty-z)
-
-    ifeq ($(feature-libbfd-liberty), 1)
-      EXTLIBS += -lbfd -lopcodes -liberty
-      FEATURE_CHECK_LDFLAGS-disassembler-four-args += -liberty -ldl
-      FEATURE_CHECK_LDFLAGS-disassembler-init-styled += -liberty -ldl
-    else
-      ifeq ($(feature-libbfd-liberty-z), 1)
-        EXTLIBS += -lbfd -lopcodes -liberty -lz
-        FEATURE_CHECK_LDFLAGS-disassembler-four-args += -liberty -lz -ldl
-        FEATURE_CHECK_LDFLAGS-disassembler-init-styled += -liberty -lz -ldl
-      endif
-    endif
-    $(call feature_check,disassembler-four-args)
-    $(call feature_check,disassembler-init-styled)
+  else ifeq ($(feature-libbfd-liberty), 1)
+    EXTLIBS += -lbfd -lopcodes -liberty
+  else ifeq ($(feature-libbfd-liberty-z), 1)
+    EXTLIBS += -lbfd -lopcodes -liberty -lz
   endif
 
   CFLAGS += -DHAVE_LIBBFD_SUPPORT
   CXXFLAGS += -DHAVE_LIBBFD_SUPPORT
   $(call detected,CONFIG_LIBBFD)
 
-  $(call feature_check,libbfd-buildid)
-
-  ifeq ($(feature-libbfd-buildid), 1)
-    CFLAGS += -DHAVE_LIBBFD_BUILDID_SUPPORT
-  else
-    $(warning Old version of libbfd/binutils things like PE executable profiling will not be available)
-  endif
-
   ifeq ($(feature-disassembler-four-args), 1)
     CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE
   endif
@@ -1067,10 +1033,6 @@ ifndef NO_LIBNUMA
   endif
 endif
 
-ifdef HAVE_KVM_STAT_SUPPORT
-    CFLAGS += -DHAVE_KVM_STAT_SUPPORT
-endif
-
 ifeq (${IS_64_BIT}, 1)
   ifndef NO_PERF_READ_VDSO32
     $(call feature_check,compile-32)
@@ -1112,8 +1074,12 @@ ifndef NO_CAPSTONE
   $(call feature_check,libcapstone)
   ifeq ($(feature-libcapstone), 1)
     CFLAGS += -DHAVE_LIBCAPSTONE_SUPPORT $(LIBCAPSTONE_CFLAGS)
-    LDFLAGS += $(LICAPSTONE_LDFLAGS)
-    EXTLIBS += -lcapstone
+    ifdef LIBCAPSTONE_DLOPEN
+      CFLAGS += -DLIBCAPSTONE_DLOPEN
+    else
+      LDFLAGS += $(LIBCAPSTONE_LDFLAGS)
+      EXTLIBS += -lcapstone
+    endif
     $(call detected,CONFIG_LIBCAPSTONE)
   else
     msg := $(warning No libcapstone found, disables disasm engine support for 'perf script', please install libcapstone-dev/capstone-devel);
@@ -1187,6 +1153,18 @@ ifneq ($(NO_LIBTRACEEVENT),1)
   endif
 endif
 
+ifndef NO_RUST
+  $(call feature_check,rust)
+  ifneq ($(feature-rust), 1)
+    $(warning Rust is not found. Test workloads with rust are disabled.)
+    NO_RUST := 1
+  else
+    NO_RUST := 0
+    CFLAGS += -DHAVE_RUST_SUPPORT
+    $(call detected,CONFIG_RUST_SUPPORT)
+  endif
+endif
+
 # Among the variables below, these:
 #   perfexecdir
 #   libbpf_include_dir
@@ -1332,6 +1310,6 @@ endif
 
 # re-generate FEATURE-DUMP as we may have called feature_check, found out
 # extra libraries to add to LDFLAGS of some other test and then redo those
-# tests, see the block about libbfd, disassembler-four-args, for instance.
+# tests.
 $(shell rm -f $(FEATURE_DUMP_FILENAME))
 $(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME)))
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index b3f481a626af..11b63bafdb23 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -35,6 +35,9 @@ include ../scripts/utilities.mak
 #
 # Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
 #
+# Define EXTRA_BPF_FLAGS="--sysroot=<path>" or other custom include paths for
+# cross-compiling BPF skeletons
+#
 # Define EXCLUDE_EXTLIBS=-lmylib to exclude libmylib from the auto-generated
 # EXTLIBS.
 #
@@ -86,8 +89,6 @@ include ../scripts/utilities.mak
 #
 # Define NO_LIBBPF if you do not want BPF support
 #
-# Define NO_LIBCAP if you do not want process capabilities considered by perf
-#
 # Define NO_SDT if you do not want to define SDT event in perf tools,
 # note that it doesn't disable SDT scanning support.
 #
@@ -251,11 +252,12 @@ else
 endif
 
 # shellcheck is using in tools/perf/tests/Build with option -a/--check-sourced (
-# introduced in v0.4.7) and -S/--severity (introduced in v0.6.0). So make the
-# minimal shellcheck version as v0.6.0.
+# introduced in v0.4.7) and -S/--severity (introduced in v0.6.0) as well as
+# dynamic source inclusions (properly handled since v0.7.2).
+# So make the minimal shellcheck version as v0.7.2.
 ifneq ($(SHELLCHECK),)
   ifeq ($(shell expr $(shell $(SHELLCHECK) --version | grep version: | \
-        sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \< 060), 1)
+        sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \< 072), 1)
     SHELLCHECK :=
   else
     SHELLCHECK := $(SHELLCHECK) -s bash -a -S warning
@@ -272,7 +274,7 @@ ifeq ($(PYLINT),1)
   PYLINT := $(shell which pylint 2> /dev/null)
 endif
 
-export srctree OUTPUT RM CC CXX LD AR CFLAGS CXXFLAGS V BISON FLEX AWK
+export srctree OUTPUT RM CC CXX RUSTC LD AR CFLAGS CXXFLAGS V BISON FLEX AWK
 export HOSTCC HOSTLD HOSTAR HOSTCFLAGS SHELLCHECK MYPY PYLINT
 
 include $(srctree)/tools/build/Makefile.include
@@ -807,11 +809,6 @@ $(GTK_IN): FORCE prepare
 $(OUTPUT)libperf-gtk.so: $(GTK_IN) $(PERFLIBS)
 	$(QUIET_LINK)$(CC) -o $@ -shared $(LDFLAGS) $(filter %.o,$^) $(GTK_LIBS)
 
-$(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt
-
-$(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt)
-	$(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@
-
 $(SCRIPTS) : % : %.sh
 	$(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@'
 
@@ -849,7 +846,7 @@ endif
 __build-dir = $(subst $(OUTPUT),,$(dir $@))
 build-dir   = $(or $(__build-dir),.)
 
-prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders \
+prepare: $(OUTPUT)PERF-VERSION-FILE archheaders \
 	arm64-sysreg-defs \
 	$(syscall_array) \
 	$(fs_at_flags_array) \
@@ -1053,7 +1050,7 @@ cscope:
 # However, the environment gets quite big, and some programs have problems
 # with that.
 
-check: $(OUTPUT)common-cmds.h
+check: prepare
 	if sparse; \
 	then \
 		for i in *.c */*.c; \
@@ -1250,7 +1247,7 @@ endif
 $(SKEL_TMP_OUT)/%.bpf.o: $(OUTPUT)PERF-VERSION-FILE util/bpf_skel/perf_version.h | $(SKEL_TMP_OUT)
 $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) $(SKEL_OUT)/vmlinux.h
 	$(QUIET_CLANG)$(CLANG) -g -O2 -fno-stack-protector --target=bpf \
-	  $(CLANG_OPTIONS) $(BPF_INCLUDE) $(TOOLS_UAPI_INCLUDE) \
+	  $(CLANG_OPTIONS) $(EXTRA_BPF_FLAGS) $(BPF_INCLUDE) $(TOOLS_UAPI_INCLUDE) \
 	  -include $(OUTPUT)PERF-VERSION-FILE -include util/bpf_skel/perf_version.h \
 	  -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@
 
@@ -1277,6 +1274,8 @@ ifeq ($(OUTPUT),)
 		pmu-events/metric_test.log \
 		pmu-events/test-empty-pmu-events.c \
 		pmu-events/empty-pmu-events.log
+	$(Q)find pmu-events/arch -name 'extra-metrics.json' -delete -o \
+		-name 'extra-metricgroups.json' -delete
 else # When an OUTPUT directory is present, clean up the copied pmu-events/arch directory.
 	$(call QUIET_CLEAN, pmu-events) $(RM) -r $(OUTPUT)pmu-events/arch \
 		$(OUTPUT)pmu-events/pmu-events.c \
@@ -1296,7 +1295,7 @@ clean:: $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBSYMBOL)-clean $(
 	$(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 \
 		perf-read-vdsox32 $(OUTPUT)$(LIBJVMTI).so
 	$(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo \
-		$(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE \
+		TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE \
 		$(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \
 		$(OUTPUT)util/intel-pt-decoder/inat-tables.c \
 		$(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \
diff --git a/tools/perf/arch/arc/annotate/instructions.c b/tools/perf/arch/arc/annotate/instructions.c
deleted file mode 100644
index e5619770a1af..000000000000
--- a/tools/perf/arch/arc/annotate/instructions.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/compiler.h>
-
-static int arc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	arch->initialized = true;
-	arch->objdump.comment_char = ';';
-	arch->e_machine = EM_ARC;
-	arch->e_flags = 0;
-	return 0;
-}
diff --git a/tools/perf/arch/arm/include/perf_regs.h b/tools/perf/arch/arm/include/perf_regs.h
index 75ce1c370114..20c54766e3a0 100644
--- a/tools/perf/arch/arm/include/perf_regs.h
+++ b/tools/perf/arch/arm/include/perf_regs.h
@@ -4,7 +4,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/arm/include/uapi/asm/perf_regs.h"
 
 void perf_regs_load(u64 *regs);
 
diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build
index fd695e1fdaee..b94bf3c5279a 100644
--- a/tools/perf/arch/arm/util/Build
+++ b/tools/perf/arch/arm/util/Build
@@ -1,6 +1,3 @@
-perf-util-y += perf_regs.o
-
 perf-util-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind.o
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 
 perf-util-y += pmu.o auxtrace.o cs-etm.o
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index ea891d12f8f4..dc3f4e86b075 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -68,6 +68,20 @@ static const char * const metadata_ete_ro[] = {
 
 enum cs_etm_version { CS_NOT_PRESENT, CS_ETMV3, CS_ETMV4, CS_ETE };
 
+/* ETMv4 CONFIGR register bits */
+#define TRCCONFIGR_BB		BIT(3)
+#define TRCCONFIGR_CCI		BIT(4)
+#define TRCCONFIGR_CID		BIT(6)
+#define TRCCONFIGR_VMID		BIT(7)
+#define TRCCONFIGR_TS		BIT(11)
+#define TRCCONFIGR_RS		BIT(12)
+#define TRCCONFIGR_VMIDOPT	BIT(15)
+
+/* ETMv3 ETMCR register bits */
+#define ETMCR_CYC_ACC		BIT(12)
+#define ETMCR_TIMESTAMP_EN	BIT(28)
+#define ETMCR_RETURN_STACK	BIT(29)
+
 static bool cs_etm_is_ete(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu);
 static int cs_etm_get_ro(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, __u64 *val);
 static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path);
@@ -89,13 +103,14 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel
 				      struct perf_cpu cpu)
 {
 	int err;
-	__u64 val;
-	u64 contextid = evsel->core.attr.config &
-		(perf_pmu__format_bits(cs_etm_pmu, "contextid") |
-		 perf_pmu__format_bits(cs_etm_pmu, "contextid1") |
-		 perf_pmu__format_bits(cs_etm_pmu, "contextid2"));
+	u64 ctxt, ctxt1, ctxt2;
+	__u64 trcidr2;
+
+	evsel__get_config_val(evsel, "contextid", &ctxt);
+	evsel__get_config_val(evsel, "contextid1", &ctxt1);
+	evsel__get_config_val(evsel, "contextid2", &ctxt2);
 
-	if (!contextid)
+	if (!ctxt && !ctxt1 && !ctxt2)
 		return 0;
 
 	/* Not supported in etmv3 */
@@ -106,12 +121,11 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel
 	}
 
 	/* Get a handle on TRCIDR2 */
-	err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2], &val);
+	err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2], &trcidr2);
 	if (err)
 		return err;
 
-	if (contextid &
-	    perf_pmu__format_bits(cs_etm_pmu, "contextid1")) {
+	if (ctxt1) {
 		/*
 		 * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID
 		 * tracing is supported:
@@ -119,15 +133,14 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel
 		 *  0b00100 Maximum of 32-bit Context ID size.
 		 *  All other values are reserved.
 		 */
-		if (BMVAL(val, 5, 9) != 0x4) {
+		if (BMVAL(trcidr2, 5, 9) != 0x4) {
 			pr_err("%s: CONTEXTIDR_EL1 isn't supported, disable with %s/contextid1=0/\n",
 			       CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME);
 			return -EINVAL;
 		}
 	}
 
-	if (contextid &
-	    perf_pmu__format_bits(cs_etm_pmu, "contextid2")) {
+	if (ctxt2) {
 		/*
 		 * TRCIDR2.VMIDOPT[30:29] != 0 and
 		 * TRCIDR2.VMIDSIZE[14:10] == 0b00100 (32bit virtual contextid)
@@ -135,7 +148,7 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel
 		 * virtual context id is < 32bit.
 		 * Any value of VMIDSIZE >= 4 (i.e, > 32bit) is fine for us.
 		 */
-		if (!BMVAL(val, 29, 30) || BMVAL(val, 10, 14) < 4) {
+		if (!BMVAL(trcidr2, 29, 30) || BMVAL(trcidr2, 10, 14) < 4) {
 			pr_err("%s: CONTEXTIDR_EL2 isn't supported, disable with %s/contextid2=0/\n",
 			       CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME);
 			return -EINVAL;
@@ -149,10 +162,11 @@ static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel *
 				     struct perf_cpu cpu)
 {
 	int err;
-	__u64 val;
+	u64 val;
+	__u64 trcidr0;
 
-	if (!(evsel->core.attr.config &
-	      perf_pmu__format_bits(cs_etm_pmu, "timestamp")))
+	evsel__get_config_val(evsel, "timestamp", &val);
+	if (!val)
 		return 0;
 
 	if (cs_etm_get_version(cs_etm_pmu, cpu) == CS_ETMV3) {
@@ -162,7 +176,7 @@ static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel *
 	}
 
 	/* Get a handle on TRCIRD0 */
-	err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], &val);
+	err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], &trcidr0);
 	if (err)
 		return err;
 
@@ -173,10 +187,9 @@ static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel *
 	 *  0b00110 Implementation supports a maximum timestamp of 48bits.
 	 *  0b01000 Implementation supports a maximum timestamp of 64bits.
 	 */
-	val &= GENMASK(28, 24);
-	if (!val) {
+	trcidr0 &= GENMASK(28, 24);
+	if (!trcidr0)
 		return -EINVAL;
-	}
 
 	return 0;
 }
@@ -259,16 +272,19 @@ static int cs_etm_parse_snapshot_options(struct auxtrace_record *itr,
 	return 0;
 }
 
+/*
+ * If the sink name format "@sink_name" is used, lookup the sink by name to convert to
+ * "sinkid=sink_hash" format. If the user has already manually provided a hash then
+ * "sinkid" isn't overwritten. If neither are provided then the driver will pick the best
+ * sink.
+ */
 static int cs_etm_set_sink_attr(struct perf_pmu *pmu,
 				struct evsel *evsel)
 {
 	char msg[BUFSIZ], path[PATH_MAX], *sink;
 	struct evsel_config_term *term;
-	int ret = -EINVAL;
 	u32 hash;
-
-	if (evsel->core.attr.config2 & GENMASK(31, 0))
-		return 0;
+	int ret;
 
 	list_for_each_entry(term, &evsel->config_terms, list) {
 		if (term->type != EVSEL__CONFIG_TERM_DRV_CFG)
@@ -291,17 +307,26 @@ static int cs_etm_set_sink_attr(struct perf_pmu *pmu,
 			return ret;
 		}
 
-		evsel->core.attr.config2 |= hash;
+		evsel__set_config_if_unset(evsel, "sinkid", hash);
 		return 0;
 	}
 
-	/*
-	 * No sink was provided on the command line - allow the CoreSight
-	 * system to look for a default
-	 */
 	return 0;
 }
 
+static struct evsel *cs_etm_get_evsel(struct evlist *evlist,
+				      struct perf_pmu *cs_etm_pmu)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (evsel->core.attr.type == cs_etm_pmu->type)
+			return evsel;
+	}
+
+	return NULL;
+}
+
 static int cs_etm_recording_options(struct auxtrace_record *itr,
 				    struct evlist *evlist,
 				    struct record_opts *opts)
@@ -441,10 +466,8 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
 	 * when a context switch happened.
 	 */
 	if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) {
-		evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel,
-					   "timestamp", 1);
-		evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel,
-					   "contextid", 1);
+		evsel__set_config_if_unset(cs_etm_evsel, "timestamp", 1);
+		evsel__set_config_if_unset(cs_etm_evsel, "contextid", 1);
 	}
 
 	/*
@@ -453,8 +476,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
 	 * timestamp tracing.
 	 */
 	if (opts->sample_time_set)
-		evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel,
-					   "timestamp", 1);
+		evsel__set_config_if_unset(cs_etm_evsel, "timestamp", 1);
 
 	/* Add dummy event to keep tracking */
 	err = parse_event(evlist, "dummy:u");
@@ -474,64 +496,64 @@ out:
 	return err;
 }
 
-static u64 cs_etm_get_config(struct auxtrace_record *itr)
+static u64 cs_etm_synth_etmcr(struct auxtrace_record *itr)
 {
-	u64 config = 0;
 	struct cs_etm_recording *ptr =
-			container_of(itr, struct cs_etm_recording, itr);
+		container_of(itr, struct cs_etm_recording, itr);
 	struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
-	struct evlist *evlist = ptr->evlist;
-	struct evsel *evsel;
+	struct evsel *evsel = cs_etm_get_evsel(ptr->evlist, cs_etm_pmu);
+	u64 etmcr = 0;
+	u64 val;
 
-	evlist__for_each_entry(evlist, evsel) {
-		if (evsel->core.attr.type == cs_etm_pmu->type) {
-			/*
-			 * Variable perf_event_attr::config is assigned to
-			 * ETMv3/PTM.  The bit fields have been made to match
-			 * the ETMv3.5 ETRMCR register specification.  See the
-			 * PMU_FORMAT_ATTR() declarations in
-			 * drivers/hwtracing/coresight/coresight-perf.c for
-			 * details.
-			 */
-			config = evsel->core.attr.config;
-			break;
-		}
-	}
+	if (!evsel)
+		return 0;
 
-	return config;
+	/*
+	 * Synthesize what the kernel programmed into ETMCR based on
+	 * what options the event was opened with. This doesn't have to be
+	 * complete or 100% accurate, not all bits used by OpenCSD anyway.
+	 */
+	if (!evsel__get_config_val(evsel, "cycacc", &val) && val)
+		etmcr |= ETMCR_CYC_ACC;
+	if (!evsel__get_config_val(evsel, "timestamp", &val) && val)
+		etmcr |= ETMCR_TIMESTAMP_EN;
+	if (!evsel__get_config_val(evsel, "retstack", &val) && val)
+		etmcr |= ETMCR_RETURN_STACK;
+
+	return etmcr;
 }
 
-#ifndef BIT
-#define BIT(N) (1UL << (N))
-#endif
-
-static u64 cs_etmv4_get_config(struct auxtrace_record *itr)
+static u64 cs_etmv4_synth_trcconfigr(struct auxtrace_record *itr)
 {
-	u64 config = 0;
-	u64 config_opts = 0;
+	u64 trcconfigr = 0;
+	struct cs_etm_recording *ptr =
+		container_of(itr, struct cs_etm_recording, itr);
+	struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
+	struct evsel *evsel = cs_etm_get_evsel(ptr->evlist, cs_etm_pmu);
+	u64 val;
+
+	if (!evsel)
+		return 0;
 
 	/*
-	 * The perf event variable config bits represent both
-	 * the command line options and register programming
-	 * bits in ETMv3/PTM. For ETMv4 we must remap options
-	 * to real bits
+	 * Synthesize what the kernel programmed into TRCCONFIGR based on
+	 * what options the event was opened with. This doesn't have to be
+	 * complete or 100% accurate, not all bits used by OpenCSD anyway.
 	 */
-	config_opts = cs_etm_get_config(itr);
-	if (config_opts & BIT(ETM_OPT_CYCACC))
-		config |= BIT(ETM4_CFG_BIT_CYCACC);
-	if (config_opts & BIT(ETM_OPT_CTXTID))
-		config |= BIT(ETM4_CFG_BIT_CTXTID);
-	if (config_opts & BIT(ETM_OPT_TS))
-		config |= BIT(ETM4_CFG_BIT_TS);
-	if (config_opts & BIT(ETM_OPT_RETSTK))
-		config |= BIT(ETM4_CFG_BIT_RETSTK);
-	if (config_opts & BIT(ETM_OPT_CTXTID2))
-		config |= BIT(ETM4_CFG_BIT_VMID) |
-			  BIT(ETM4_CFG_BIT_VMID_OPT);
-	if (config_opts & BIT(ETM_OPT_BRANCH_BROADCAST))
-		config |= BIT(ETM4_CFG_BIT_BB);
-
-	return config;
+	if (!evsel__get_config_val(evsel, "cycacc", &val) && val)
+		trcconfigr |= TRCCONFIGR_CCI;
+	if (!evsel__get_config_val(evsel, "contextid1", &val) && val)
+		trcconfigr |= TRCCONFIGR_CID;
+	if (!evsel__get_config_val(evsel, "timestamp", &val) && val)
+		trcconfigr |= TRCCONFIGR_TS;
+	if (!evsel__get_config_val(evsel, "retstack", &val) && val)
+		trcconfigr |= TRCCONFIGR_RS;
+	if (!evsel__get_config_val(evsel, "contextid2", &val) && val)
+		trcconfigr |= TRCCONFIGR_VMID | TRCCONFIGR_VMIDOPT;
+	if (!evsel__get_config_val(evsel, "branch_broadcast", &val) && val)
+		trcconfigr |= TRCCONFIGR_BB;
+
+	return trcconfigr;
 }
 
 static size_t
@@ -653,7 +675,7 @@ static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr,
 	struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
 
 	/* Get trace configuration register */
-	data[CS_ETMV4_TRCCONFIGR] = cs_etmv4_get_config(itr);
+	data[CS_ETMV4_TRCCONFIGR] = cs_etmv4_synth_trcconfigr(itr);
 	/* traceID set to legacy version, in case new perf running on older system */
 	data[CS_ETMV4_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu);
 
@@ -685,7 +707,7 @@ static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, st
 	struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
 
 	/* Get trace configuration register */
-	data[CS_ETE_TRCCONFIGR] = cs_etmv4_get_config(itr);
+	data[CS_ETE_TRCCONFIGR] = cs_etmv4_synth_trcconfigr(itr);
 	/* traceID set to legacy version, in case new perf running on older system */
 	data[CS_ETE_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu);
 
@@ -741,7 +763,7 @@ static void cs_etm_get_metadata(struct perf_cpu cpu, u32 *offset,
 	case CS_ETMV3:
 		magic = __perf_cs_etmv3_magic;
 		/* Get configuration register */
-		info->priv[*offset + CS_ETM_ETMCR] = cs_etm_get_config(itr);
+		info->priv[*offset + CS_ETM_ETMCR] = cs_etm_synth_etmcr(itr);
 		/* traceID set to legacy value in case new perf running on old system */
 		info->priv[*offset + CS_ETM_ETMTRACEIDR] = cs_etm_get_legacy_trace_id(cpu);
 		/* Get read-only information from sysFS */
@@ -832,12 +854,11 @@ static int cs_etm_snapshot_start(struct auxtrace_record *itr)
 {
 	struct cs_etm_recording *ptr =
 			container_of(itr, struct cs_etm_recording, itr);
-	struct evsel *evsel;
+	struct evsel *evsel = cs_etm_get_evsel(ptr->evlist, ptr->cs_etm_pmu);
+
+	if (evsel)
+		return evsel__disable(evsel);
 
-	evlist__for_each_entry(ptr->evlist, evsel) {
-		if (evsel->core.attr.type == ptr->cs_etm_pmu->type)
-			return evsel__disable(evsel);
-	}
 	return -EINVAL;
 }
 
diff --git a/tools/perf/arch/arm/util/perf_regs.c b/tools/perf/arch/arm/util/perf_regs.c
deleted file mode 100644
index f94a0210c7b7..000000000000
--- a/tools/perf/arch/arm/util/perf_regs.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../../util/perf_regs.h"
-
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/arm/util/unwind-libdw.c b/tools/perf/arch/arm/util/unwind-libdw.c
deleted file mode 100644
index fbb643f224ec..000000000000
--- a/tools/perf/arch/arm/util/unwind-libdw.c
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../../util/unwind-libdw.h"
-#include "../../../util/perf_regs.h"
-#include "../../../util/sample.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_ARM_MAX];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_ARM_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = REG(R0);
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(FP);
-	dwarf_regs[12] = REG(IP);
-	dwarf_regs[13] = REG(SP);
-	dwarf_regs[14] = REG(LR);
-	dwarf_regs[15] = REG(PC);
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_ARM_MAX,
-					   dwarf_regs);
-}
diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile
index 087e099fb453..44cc3f023318 100644
--- a/tools/perf/arch/arm64/Makefile
+++ b/tools/perf/arch/arm64/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
 PERF_HAVE_JITDUMP := 1
-HAVE_KVM_STAT_SUPPORT := 1
diff --git a/tools/perf/arch/arm64/include/perf_regs.h b/tools/perf/arch/arm64/include/perf_regs.h
index 58639ee9f7ea..372f2565a9dd 100644
--- a/tools/perf/arch/arm64/include/perf_regs.h
+++ b/tools/perf/arch/arm64/include/perf_regs.h
@@ -5,7 +5,7 @@
 #include <stdlib.h>
 #include <linux/types.h>
 #define perf_event_arm_regs perf_event_arm64_regs
-#include <asm/perf_regs.h>
+#include "../../../../arch/arm64/include/uapi/asm/perf_regs.h"
 #undef perf_event_arm_regs
 
 void perf_regs_load(u64 *regs);
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index d63881081d2e..4e06a08d281a 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -1,5 +1,3 @@
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
 perf-util-y += ../../arm/util/auxtrace.o
 perf-util-y += ../../arm/util/cs-etm.o
@@ -9,6 +7,5 @@ perf-util-y += header.o
 perf-util-y += hisi-ptt.o
 perf-util-y += machine.o
 perf-util-y += mem-events.o
-perf-util-y += perf_regs.o
 perf-util-y += pmu.o
 perf-util-y += tsc.o
diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c
index d5ec1408d0ae..17ced7bbbdda 100644
--- a/tools/perf/arch/arm64/util/arm-spe.c
+++ b/tools/perf/arch/arm64/util/arm-spe.c
@@ -256,7 +256,7 @@ static __u64 arm_spe_pmu__sample_period(const struct perf_pmu *arm_spe_pmu)
 
 static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus)
 {
-	u64 bit;
+	u64 pa_enable_bit;
 
 	evsel->core.attr.freq = 0;
 	evsel->core.attr.sample_period = arm_spe_pmu__sample_period(evsel->pmu);
@@ -274,7 +274,7 @@ static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus)
 	 */
 	if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) {
 		evsel__set_sample_bit(evsel, CPU);
-		evsel__set_config_if_unset(evsel->pmu, evsel, "ts_enable", 1);
+		evsel__set_config_if_unset(evsel, "ts_enable", 1);
 	}
 
 	/*
@@ -288,9 +288,10 @@ static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus)
 	 * inform that the resulting output's SPE samples contain physical addresses
 	 * where applicable.
 	 */
-	bit = perf_pmu__format_bits(evsel->pmu, "pa_enable");
-	if (evsel->core.attr.config & bit)
-		evsel__set_sample_bit(evsel, PHYS_ADDR);
+
+	if (!evsel__get_config_val(evsel, "pa_enable", &pa_enable_bit))
+		if (pa_enable_bit)
+			evsel__set_sample_bit(evsel, PHYS_ADDR);
 }
 
 static int arm_spe_setup_aux_buffer(struct record_opts *opts)
@@ -397,6 +398,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr,
 	struct perf_cpu_map *cpus = evlist->core.user_requested_cpus;
 	bool discard = false;
 	int err;
+	u64 discard_bit;
 
 	sper->evlist = evlist;
 
@@ -425,9 +427,8 @@ static int arm_spe_recording_options(struct auxtrace_record *itr,
 	evlist__for_each_entry_safe(evlist, tmp, evsel) {
 		if (evsel__is_aux_event(evsel)) {
 			arm_spe_setup_evsel(evsel, cpus);
-			if (evsel->core.attr.config &
-			    perf_pmu__format_bits(evsel->pmu, "discard"))
-				discard = true;
+			if (!evsel__get_config_val(evsel, "discard", &discard_bit))
+				discard = !!discard_bit;
 		}
 	}
 
diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c
index f445a2dd6293..cbc0ba101636 100644
--- a/tools/perf/arch/arm64/util/header.c
+++ b/tools/perf/arch/arm64/util/header.c
@@ -1,4 +1,3 @@
-#include <linux/kernel.h>
 #include <linux/bits.h>
 #include <linux/bitfield.h>
 #include <stdio.h>
diff --git a/tools/perf/arch/arm64/util/machine.c b/tools/perf/arch/arm64/util/machine.c
index aab1cc2bc283..80fb13c958d9 100644
--- a/tools/perf/arch/arm64/util/machine.c
+++ b/tools/perf/arch/arm64/util/machine.c
@@ -1,18 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include <inttypes.h>
-#include <stdio.h>
-#include <string.h>
-#include "debug.h"
-#include "symbol.h"
-#include "callchain.h"
+#include "callchain.h" // prototype of arch__add_leaf_frame_record_opts
 #include "perf_regs.h"
 #include "record.h"
-#include "util/perf_regs.h"
+
+#define SMPL_REG_MASK(b) (1ULL << (b))
 
 void arch__add_leaf_frame_record_opts(struct record_opts *opts)
 {
-	const struct sample_reg *sample_reg_masks = arch__sample_reg_masks();
-
-	opts->sample_user_regs |= sample_reg_masks[PERF_REG_ARM64_LR].mask;
+	opts->sample_user_regs |= SMPL_REG_MASK(PERF_REG_ARM64_LR);
 }
diff --git a/tools/perf/arch/arm64/util/mem-events.c b/tools/perf/arch/arm64/util/mem-events.c
index 9f8da7937255..eaf00e0609c6 100644
--- a/tools/perf/arch/arm64/util/mem-events.c
+++ b/tools/perf/arch/arm64/util/mem-events.c
@@ -6,7 +6,7 @@
 #define E(t, n, s, l, a) { .tag = t, .name = n, .event_name = s, .ldlat = l, .aux_event = a }
 
 struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX] = {
-	E("spe-load",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	NULL,	true,	0),
-	E("spe-store",	"%s/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			NULL,	false,	0),
+	E("spe-load",	"%s/ts_enable=1,pa_enable=1,load_filter=1,min_latency=%u/",	NULL,	true,	0),
+	E("spe-store",	"%s/ts_enable=1,pa_enable=1,store_filter=1/",			NULL,	false,	0),
 	E("spe-ldst",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	NULL,	true,	0),
 };
diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c
deleted file mode 100644
index 09308665e28a..000000000000
--- a/tools/perf/arch/arm64/util/perf_regs.c
+++ /dev/null
@@ -1,182 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <errno.h>
-#include <regex.h>
-#include <string.h>
-#include <sys/auxv.h>
-#include <linux/kernel.h>
-#include <linux/zalloc.h>
-
-#include "perf_regs.h"
-#include "../../../perf-sys.h"
-#include "../../../util/debug.h"
-#include "../../../util/event.h"
-#include "../../../util/perf_regs.h"
-
-#ifndef HWCAP_SVE
-#define HWCAP_SVE	(1 << 22)
-#endif
-
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG(x0, PERF_REG_ARM64_X0),
-	SMPL_REG(x1, PERF_REG_ARM64_X1),
-	SMPL_REG(x2, PERF_REG_ARM64_X2),
-	SMPL_REG(x3, PERF_REG_ARM64_X3),
-	SMPL_REG(x4, PERF_REG_ARM64_X4),
-	SMPL_REG(x5, PERF_REG_ARM64_X5),
-	SMPL_REG(x6, PERF_REG_ARM64_X6),
-	SMPL_REG(x7, PERF_REG_ARM64_X7),
-	SMPL_REG(x8, PERF_REG_ARM64_X8),
-	SMPL_REG(x9, PERF_REG_ARM64_X9),
-	SMPL_REG(x10, PERF_REG_ARM64_X10),
-	SMPL_REG(x11, PERF_REG_ARM64_X11),
-	SMPL_REG(x12, PERF_REG_ARM64_X12),
-	SMPL_REG(x13, PERF_REG_ARM64_X13),
-	SMPL_REG(x14, PERF_REG_ARM64_X14),
-	SMPL_REG(x15, PERF_REG_ARM64_X15),
-	SMPL_REG(x16, PERF_REG_ARM64_X16),
-	SMPL_REG(x17, PERF_REG_ARM64_X17),
-	SMPL_REG(x18, PERF_REG_ARM64_X18),
-	SMPL_REG(x19, PERF_REG_ARM64_X19),
-	SMPL_REG(x20, PERF_REG_ARM64_X20),
-	SMPL_REG(x21, PERF_REG_ARM64_X21),
-	SMPL_REG(x22, PERF_REG_ARM64_X22),
-	SMPL_REG(x23, PERF_REG_ARM64_X23),
-	SMPL_REG(x24, PERF_REG_ARM64_X24),
-	SMPL_REG(x25, PERF_REG_ARM64_X25),
-	SMPL_REG(x26, PERF_REG_ARM64_X26),
-	SMPL_REG(x27, PERF_REG_ARM64_X27),
-	SMPL_REG(x28, PERF_REG_ARM64_X28),
-	SMPL_REG(x29, PERF_REG_ARM64_X29),
-	SMPL_REG(lr, PERF_REG_ARM64_LR),
-	SMPL_REG(sp, PERF_REG_ARM64_SP),
-	SMPL_REG(pc, PERF_REG_ARM64_PC),
-	SMPL_REG(vg, PERF_REG_ARM64_VG),
-	SMPL_REG_END
-};
-
-/* %xNUM */
-#define SDT_OP_REGEX1  "^(x[1-2]?[0-9]|3[0-1])$"
-
-/* [sp], [sp, NUM] */
-#define SDT_OP_REGEX2  "^\\[sp(, )?([0-9]+)?\\]$"
-
-static regex_t sdt_op_regex1, sdt_op_regex2;
-
-static int sdt_init_op_regex(void)
-{
-	static int initialized;
-	int ret = 0;
-
-	if (initialized)
-		return 0;
-
-	ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
-	if (ret)
-		goto error;
-
-	ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
-	if (ret)
-		goto free_regex1;
-
-	initialized = 1;
-	return 0;
-
-free_regex1:
-	regfree(&sdt_op_regex1);
-error:
-	pr_debug4("Regex compilation error.\n");
-	return ret;
-}
-
-/*
- * SDT marker arguments on Arm64 uses %xREG or [sp, NUM], currently
- * support these two formats.
- */
-int arch_sdt_arg_parse_op(char *old_op, char **new_op)
-{
-	int ret, new_len;
-	regmatch_t rm[5];
-
-	ret = sdt_init_op_regex();
-	if (ret < 0)
-		return ret;
-
-	if (!regexec(&sdt_op_regex1, old_op, 3, rm, 0)) {
-		/* Extract xNUM */
-		new_len = 2;	/* % NULL */
-		new_len += (int)(rm[1].rm_eo - rm[1].rm_so);
-
-		*new_op = zalloc(new_len);
-		if (!*new_op)
-			return -ENOMEM;
-
-		scnprintf(*new_op, new_len, "%%%.*s",
-			(int)(rm[1].rm_eo - rm[1].rm_so), old_op + rm[1].rm_so);
-	} else if (!regexec(&sdt_op_regex2, old_op, 5, rm, 0)) {
-		/* [sp], [sp, NUM] or [sp,NUM] */
-		new_len = 7;	/* + ( % s p ) NULL */
-
-		/* If the argument is [sp], need to fill offset '0' */
-		if (rm[2].rm_so == -1)
-			new_len += 1;
-		else
-			new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
-
-		*new_op = zalloc(new_len);
-		if (!*new_op)
-			return -ENOMEM;
-
-		if (rm[2].rm_so == -1)
-			scnprintf(*new_op, new_len, "+0(%%sp)");
-		else
-			scnprintf(*new_op, new_len, "+%.*s(%%sp)",
-				  (int)(rm[2].rm_eo - rm[2].rm_so),
-				  old_op + rm[2].rm_so);
-	} else {
-		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
-		return SDT_ARG_SKIP;
-	}
-
-	return SDT_ARG_VALID;
-}
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	struct perf_event_attr attr = {
-		.type                   = PERF_TYPE_HARDWARE,
-		.config                 = PERF_COUNT_HW_CPU_CYCLES,
-		.sample_type            = PERF_SAMPLE_REGS_USER,
-		.disabled               = 1,
-		.exclude_kernel         = 1,
-		.sample_period		= 1,
-		.sample_regs_user	= PERF_REGS_MASK
-	};
-	int fd;
-
-	if (getauxval(AT_HWCAP) & HWCAP_SVE)
-		attr.sample_regs_user |= SMPL_REG_MASK(PERF_REG_ARM64_VG);
-
-	/*
-	 * Check if the pmu supports perf extended regs, before
-	 * returning the register mask to sample.
-	 */
-	if (attr.sample_regs_user != PERF_REGS_MASK) {
-		event_attr_init(&attr);
-		fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
-		if (fd != -1) {
-			close(fd);
-			return attr.sample_regs_user;
-		}
-	}
-	return PERF_REGS_MASK;
-}
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/arm64/util/unwind-libdw.c b/tools/perf/arch/arm64/util/unwind-libdw.c
deleted file mode 100644
index b89b0a7e5ad9..000000000000
--- a/tools/perf/arch/arm64/util/unwind-libdw.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../../util/unwind-libdw.h"
-#include "../../../util/perf_regs.h"
-#include "../../../util/sample.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_ARM64_MAX], dwarf_pc;
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_ARM64_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = REG(X0);
-	dwarf_regs[1]  = REG(X1);
-	dwarf_regs[2]  = REG(X2);
-	dwarf_regs[3]  = REG(X3);
-	dwarf_regs[4]  = REG(X4);
-	dwarf_regs[5]  = REG(X5);
-	dwarf_regs[6]  = REG(X6);
-	dwarf_regs[7]  = REG(X7);
-	dwarf_regs[8]  = REG(X8);
-	dwarf_regs[9]  = REG(X9);
-	dwarf_regs[10] = REG(X10);
-	dwarf_regs[11] = REG(X11);
-	dwarf_regs[12] = REG(X12);
-	dwarf_regs[13] = REG(X13);
-	dwarf_regs[14] = REG(X14);
-	dwarf_regs[15] = REG(X15);
-	dwarf_regs[16] = REG(X16);
-	dwarf_regs[17] = REG(X17);
-	dwarf_regs[18] = REG(X18);
-	dwarf_regs[19] = REG(X19);
-	dwarf_regs[20] = REG(X20);
-	dwarf_regs[21] = REG(X21);
-	dwarf_regs[22] = REG(X22);
-	dwarf_regs[23] = REG(X23);
-	dwarf_regs[24] = REG(X24);
-	dwarf_regs[25] = REG(X25);
-	dwarf_regs[26] = REG(X26);
-	dwarf_regs[27] = REG(X27);
-	dwarf_regs[28] = REG(X28);
-	dwarf_regs[29] = REG(X29);
-	dwarf_regs[30] = REG(LR);
-	dwarf_regs[31] = REG(SP);
-
-	if (!dwfl_thread_state_registers(thread, 0, PERF_REG_ARM64_MAX,
-					 dwarf_regs))
-		return false;
-
-	dwarf_pc = REG(PC);
-	dwfl_thread_state_register_pc(thread, dwarf_pc);
-
-	return true;
-}
diff --git a/tools/perf/arch/csky/Build b/tools/perf/arch/csky/Build
deleted file mode 100644
index e63eabc2c8f4..000000000000
--- a/tools/perf/arch/csky/Build
+++ /dev/null
@@ -1 +0,0 @@
-perf-util-y += util/
diff --git a/tools/perf/arch/csky/include/perf_regs.h b/tools/perf/arch/csky/include/perf_regs.h
index 076c7746c8a2..0bf7b963909c 100644
--- a/tools/perf/arch/csky/include/perf_regs.h
+++ b/tools/perf/arch/csky/include/perf_regs.h
@@ -6,7 +6,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/csky/include/uapi/asm/perf_regs.h"
 
 #define PERF_REGS_MASK	((1ULL << PERF_REG_CSKY_MAX) - 1)
 #define PERF_REGS_MAX	PERF_REG_CSKY_MAX
diff --git a/tools/perf/arch/csky/util/Build b/tools/perf/arch/csky/util/Build
deleted file mode 100644
index 5e6ea82c4202..000000000000
--- a/tools/perf/arch/csky/util/Build
+++ /dev/null
@@ -1,3 +0,0 @@
-perf-util-y += perf_regs.o
-
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/csky/util/perf_regs.c b/tools/perf/arch/csky/util/perf_regs.c
deleted file mode 100644
index 6b1665f41180..000000000000
--- a/tools/perf/arch/csky/util/perf_regs.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../util/perf_regs.h"
-
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/csky/util/unwind-libdw.c b/tools/perf/arch/csky/util/unwind-libdw.c
deleted file mode 100644
index b20b1569783d..000000000000
--- a/tools/perf/arch/csky/util/unwind-libdw.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
-
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../util/unwind-libdw.h"
-#include "../../util/perf_regs.h"
-#include "../../util/event.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_CSKY_MAX];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_CSKY_##r);	\
-	val;							\
-})
-
-#if defined(__CSKYABIV2__)
-	dwarf_regs[0]  = REG(A0);
-	dwarf_regs[1]  = REG(A1);
-	dwarf_regs[2]  = REG(A2);
-	dwarf_regs[3]  = REG(A3);
-	dwarf_regs[4]  = REG(REGS0);
-	dwarf_regs[5]  = REG(REGS1);
-	dwarf_regs[6]  = REG(REGS2);
-	dwarf_regs[7]  = REG(REGS3);
-	dwarf_regs[8]  = REG(REGS4);
-	dwarf_regs[9]  = REG(REGS5);
-	dwarf_regs[10] = REG(REGS6);
-	dwarf_regs[11] = REG(REGS7);
-	dwarf_regs[12] = REG(REGS8);
-	dwarf_regs[13] = REG(REGS9);
-	dwarf_regs[14] = REG(SP);
-	dwarf_regs[15] = REG(LR);
-	dwarf_regs[16] = REG(EXREGS0);
-	dwarf_regs[17] = REG(EXREGS1);
-	dwarf_regs[18] = REG(EXREGS2);
-	dwarf_regs[19] = REG(EXREGS3);
-	dwarf_regs[20] = REG(EXREGS4);
-	dwarf_regs[21] = REG(EXREGS5);
-	dwarf_regs[22] = REG(EXREGS6);
-	dwarf_regs[23] = REG(EXREGS7);
-	dwarf_regs[24] = REG(EXREGS8);
-	dwarf_regs[25] = REG(EXREGS9);
-	dwarf_regs[26] = REG(EXREGS10);
-	dwarf_regs[27] = REG(EXREGS11);
-	dwarf_regs[28] = REG(EXREGS12);
-	dwarf_regs[29] = REG(EXREGS13);
-	dwarf_regs[30] = REG(EXREGS14);
-	dwarf_regs[31] = REG(TLS);
-	dwarf_regs[32] = REG(PC);
-#else
-	dwarf_regs[0]  = REG(SP);
-	dwarf_regs[1]  = REG(REGS9);
-	dwarf_regs[2]  = REG(A0);
-	dwarf_regs[3]  = REG(A1);
-	dwarf_regs[4]  = REG(A2);
-	dwarf_regs[5]  = REG(A3);
-	dwarf_regs[6]  = REG(REGS0);
-	dwarf_regs[7]  = REG(REGS1);
-	dwarf_regs[8]  = REG(REGS2);
-	dwarf_regs[9]  = REG(REGS3);
-	dwarf_regs[10] = REG(REGS4);
-	dwarf_regs[11] = REG(REGS5);
-	dwarf_regs[12] = REG(REGS6);
-	dwarf_regs[13] = REG(REGS7);
-	dwarf_regs[14] = REG(REGS8);
-	dwarf_regs[15] = REG(LR);
-#endif
-	dwfl_thread_state_register_pc(thread, REG(PC));
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_CSKY_MAX,
-					   dwarf_regs);
-}
diff --git a/tools/perf/arch/loongarch/Makefile b/tools/perf/arch/loongarch/Makefile
index 087e099fb453..44cc3f023318 100644
--- a/tools/perf/arch/loongarch/Makefile
+++ b/tools/perf/arch/loongarch/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
 PERF_HAVE_JITDUMP := 1
-HAVE_KVM_STAT_SUPPORT := 1
diff --git a/tools/perf/arch/loongarch/include/perf_regs.h b/tools/perf/arch/loongarch/include/perf_regs.h
index 45c799fa5330..b86078a55e90 100644
--- a/tools/perf/arch/loongarch/include/perf_regs.h
+++ b/tools/perf/arch/loongarch/include/perf_regs.h
@@ -4,7 +4,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/loongarch/include/uapi/asm/perf_regs.h"
 
 #define PERF_REGS_MAX PERF_REG_LOONGARCH_MAX
 
diff --git a/tools/perf/arch/loongarch/util/Build b/tools/perf/arch/loongarch/util/Build
index 0aa31986ecb5..3ad73d0289f3 100644
--- a/tools/perf/arch/loongarch/util/Build
+++ b/tools/perf/arch/loongarch/util/Build
@@ -1,6 +1,4 @@
 perf-util-y += header.o
-perf-util-y += perf_regs.o
 
 perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
 perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
diff --git a/tools/perf/arch/loongarch/util/perf_regs.c b/tools/perf/arch/loongarch/util/perf_regs.c
deleted file mode 100644
index f94a0210c7b7..000000000000
--- a/tools/perf/arch/loongarch/util/perf_regs.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../../util/perf_regs.h"
-
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/loongarch/util/unwind-libdw.c b/tools/perf/arch/loongarch/util/unwind-libdw.c
deleted file mode 100644
index 60b1144bedd5..000000000000
--- a/tools/perf/arch/loongarch/util/unwind-libdw.c
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2020-2023 Loongson Technology Corporation Limited */
-
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../util/unwind-libdw.h"
-#include "../../util/perf_regs.h"
-#include "../../util/sample.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_LOONGARCH_MAX];
-
-#define REG(r) ({							\
-	Dwarf_Word val = 0;						\
-	perf_reg_value(&val, user_regs, PERF_REG_LOONGARCH_##r);	\
-	val;								\
-})
-
-	dwarf_regs[0]  = 0;
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(R11);
-	dwarf_regs[12] = REG(R12);
-	dwarf_regs[13] = REG(R13);
-	dwarf_regs[14] = REG(R14);
-	dwarf_regs[15] = REG(R15);
-	dwarf_regs[16] = REG(R16);
-	dwarf_regs[17] = REG(R17);
-	dwarf_regs[18] = REG(R18);
-	dwarf_regs[19] = REG(R19);
-	dwarf_regs[20] = REG(R20);
-	dwarf_regs[21] = REG(R21);
-	dwarf_regs[22] = REG(R22);
-	dwarf_regs[23] = REG(R23);
-	dwarf_regs[24] = REG(R24);
-	dwarf_regs[25] = REG(R25);
-	dwarf_regs[26] = REG(R26);
-	dwarf_regs[27] = REG(R27);
-	dwarf_regs[28] = REG(R28);
-	dwarf_regs[29] = REG(R29);
-	dwarf_regs[30] = REG(R30);
-	dwarf_regs[31] = REG(R31);
-	dwfl_thread_state_register_pc(thread, REG(PC));
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_LOONGARCH_MAX, dwarf_regs);
-}
diff --git a/tools/perf/arch/mips/include/perf_regs.h b/tools/perf/arch/mips/include/perf_regs.h
index 7082e91e0ed1..66655f0c4fea 100644
--- a/tools/perf/arch/mips/include/perf_regs.h
+++ b/tools/perf/arch/mips/include/perf_regs.h
@@ -4,7 +4,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/mips/include/uapi/asm/perf_regs.h"
 
 #define PERF_REGS_MAX PERF_REG_MIPS_MAX
 
diff --git a/tools/perf/arch/mips/util/Build b/tools/perf/arch/mips/util/Build
index 691fa2051958..818b808a8247 100644
--- a/tools/perf/arch/mips/util/Build
+++ b/tools/perf/arch/mips/util/Build
@@ -1,2 +1 @@
-perf-util-y += perf_regs.o
 perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/mips/util/perf_regs.c b/tools/perf/arch/mips/util/perf_regs.c
deleted file mode 100644
index 6b1665f41180..000000000000
--- a/tools/perf/arch/mips/util/perf_regs.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../util/perf_regs.h"
-
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
index a295a80ea078..44cc3f023318 100644
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
-HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
index 1c66f6ba6773..22b492a3dd58 100644
--- a/tools/perf/arch/powerpc/include/perf_regs.h
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -4,7 +4,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/powerpc/include/uapi/asm/perf_regs.h"
 
 void perf_regs_load(u64 *regs);
 
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index 3d0d5427aef7..d66574cbb9a9 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -1,6 +1,4 @@
 perf-util-y += header.o
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
-perf-util-y += perf_regs.o
 perf-util-y += mem-events.o
 perf-util-y += pmu.o
 perf-util-y += sym-handling.o
@@ -9,5 +7,4 @@ perf-util-y += evsel.o
 perf-util-$(CONFIG_LIBDW) += skip-callchain-idx.o
 
 perf-util-$(CONFIG_LIBUNWIND) += unwind-libunwind.o
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 perf-util-y += auxtrace.o
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
deleted file mode 100644
index bd36cfd420a2..000000000000
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <errno.h>
-#include <string.h>
-#include <regex.h>
-#include <linux/zalloc.h>
-
-#include "perf_regs.h"
-#include "../../../util/perf_regs.h"
-#include "../../../util/debug.h"
-#include "../../../util/event.h"
-#include "../../../util/header.h"
-#include "../../../perf-sys.h"
-#include "utils_header.h"
-
-#include <linux/kernel.h>
-
-#define PVR_POWER9		0x004E
-#define PVR_POWER10		0x0080
-#define PVR_POWER11		0x0082
-
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG(r0, PERF_REG_POWERPC_R0),
-	SMPL_REG(r1, PERF_REG_POWERPC_R1),
-	SMPL_REG(r2, PERF_REG_POWERPC_R2),
-	SMPL_REG(r3, PERF_REG_POWERPC_R3),
-	SMPL_REG(r4, PERF_REG_POWERPC_R4),
-	SMPL_REG(r5, PERF_REG_POWERPC_R5),
-	SMPL_REG(r6, PERF_REG_POWERPC_R6),
-	SMPL_REG(r7, PERF_REG_POWERPC_R7),
-	SMPL_REG(r8, PERF_REG_POWERPC_R8),
-	SMPL_REG(r9, PERF_REG_POWERPC_R9),
-	SMPL_REG(r10, PERF_REG_POWERPC_R10),
-	SMPL_REG(r11, PERF_REG_POWERPC_R11),
-	SMPL_REG(r12, PERF_REG_POWERPC_R12),
-	SMPL_REG(r13, PERF_REG_POWERPC_R13),
-	SMPL_REG(r14, PERF_REG_POWERPC_R14),
-	SMPL_REG(r15, PERF_REG_POWERPC_R15),
-	SMPL_REG(r16, PERF_REG_POWERPC_R16),
-	SMPL_REG(r17, PERF_REG_POWERPC_R17),
-	SMPL_REG(r18, PERF_REG_POWERPC_R18),
-	SMPL_REG(r19, PERF_REG_POWERPC_R19),
-	SMPL_REG(r20, PERF_REG_POWERPC_R20),
-	SMPL_REG(r21, PERF_REG_POWERPC_R21),
-	SMPL_REG(r22, PERF_REG_POWERPC_R22),
-	SMPL_REG(r23, PERF_REG_POWERPC_R23),
-	SMPL_REG(r24, PERF_REG_POWERPC_R24),
-	SMPL_REG(r25, PERF_REG_POWERPC_R25),
-	SMPL_REG(r26, PERF_REG_POWERPC_R26),
-	SMPL_REG(r27, PERF_REG_POWERPC_R27),
-	SMPL_REG(r28, PERF_REG_POWERPC_R28),
-	SMPL_REG(r29, PERF_REG_POWERPC_R29),
-	SMPL_REG(r30, PERF_REG_POWERPC_R30),
-	SMPL_REG(r31, PERF_REG_POWERPC_R31),
-	SMPL_REG(nip, PERF_REG_POWERPC_NIP),
-	SMPL_REG(msr, PERF_REG_POWERPC_MSR),
-	SMPL_REG(orig_r3, PERF_REG_POWERPC_ORIG_R3),
-	SMPL_REG(ctr, PERF_REG_POWERPC_CTR),
-	SMPL_REG(link, PERF_REG_POWERPC_LINK),
-	SMPL_REG(xer, PERF_REG_POWERPC_XER),
-	SMPL_REG(ccr, PERF_REG_POWERPC_CCR),
-	SMPL_REG(softe, PERF_REG_POWERPC_SOFTE),
-	SMPL_REG(trap, PERF_REG_POWERPC_TRAP),
-	SMPL_REG(dar, PERF_REG_POWERPC_DAR),
-	SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
-	SMPL_REG(sier, PERF_REG_POWERPC_SIER),
-	SMPL_REG(mmcra, PERF_REG_POWERPC_MMCRA),
-	SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0),
-	SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1),
-	SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2),
-	SMPL_REG(mmcr3, PERF_REG_POWERPC_MMCR3),
-	SMPL_REG(sier2, PERF_REG_POWERPC_SIER2),
-	SMPL_REG(sier3, PERF_REG_POWERPC_SIER3),
-	SMPL_REG(pmc1, PERF_REG_POWERPC_PMC1),
-	SMPL_REG(pmc2, PERF_REG_POWERPC_PMC2),
-	SMPL_REG(pmc3, PERF_REG_POWERPC_PMC3),
-	SMPL_REG(pmc4, PERF_REG_POWERPC_PMC4),
-	SMPL_REG(pmc5, PERF_REG_POWERPC_PMC5),
-	SMPL_REG(pmc6, PERF_REG_POWERPC_PMC6),
-	SMPL_REG(sdar, PERF_REG_POWERPC_SDAR),
-	SMPL_REG(siar, PERF_REG_POWERPC_SIAR),
-	SMPL_REG_END
-};
-
-/* REG or %rREG */
-#define SDT_OP_REGEX1  "^(%r)?([1-2]?[0-9]|3[0-1])$"
-
-/* -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) */
-#define SDT_OP_REGEX2  "^(\\-)?([0-9]+)\\((%r)?([1-2]?[0-9]|3[0-1])\\)$"
-
-static regex_t sdt_op_regex1, sdt_op_regex2;
-
-static int sdt_init_op_regex(void)
-{
-	static int initialized;
-	int ret = 0;
-
-	if (initialized)
-		return 0;
-
-	ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
-	if (ret)
-		goto error;
-
-	ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
-	if (ret)
-		goto free_regex1;
-
-	initialized = 1;
-	return 0;
-
-free_regex1:
-	regfree(&sdt_op_regex1);
-error:
-	pr_debug4("Regex compilation error.\n");
-	return ret;
-}
-
-/*
- * Parse OP and convert it into uprobe format, which is, +/-NUM(%gprREG).
- * Possible variants of OP are:
- *	Format		Example
- *	-------------------------
- *	NUM(REG)	48(18)
- *	-NUM(REG)	-48(18)
- *	NUM(%rREG)	48(%r18)
- *	-NUM(%rREG)	-48(%r18)
- *	REG		18
- *	%rREG		%r18
- *	iNUM		i0
- *	i-NUM		i-1
- *
- * SDT marker arguments on Powerpc uses %rREG form with -mregnames flag
- * and REG form with -mno-regnames. Here REG is general purpose register,
- * which is in 0 to 31 range.
- */
-int arch_sdt_arg_parse_op(char *old_op, char **new_op)
-{
-	int ret, new_len;
-	regmatch_t rm[5];
-	char prefix;
-
-	/* Constant argument. Uprobe does not support it */
-	if (old_op[0] == 'i') {
-		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
-		return SDT_ARG_SKIP;
-	}
-
-	ret = sdt_init_op_regex();
-	if (ret < 0)
-		return ret;
-
-	if (!regexec(&sdt_op_regex1, old_op, 3, rm, 0)) {
-		/* REG or %rREG --> %gprREG */
-
-		new_len = 5;	/* % g p r NULL */
-		new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
-
-		*new_op = zalloc(new_len);
-		if (!*new_op)
-			return -ENOMEM;
-
-		scnprintf(*new_op, new_len, "%%gpr%.*s",
-			(int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so);
-	} else if (!regexec(&sdt_op_regex2, old_op, 5, rm, 0)) {
-		/*
-		 * -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) -->
-		 *	+/-NUM(%gprREG)
-		 */
-		prefix = (rm[1].rm_so == -1) ? '+' : '-';
-
-		new_len = 8;	/* +/- ( % g p r ) NULL */
-		new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
-		new_len += (int)(rm[4].rm_eo - rm[4].rm_so);
-
-		*new_op = zalloc(new_len);
-		if (!*new_op)
-			return -ENOMEM;
-
-		scnprintf(*new_op, new_len, "%c%.*s(%%gpr%.*s)", prefix,
-			(int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
-			(int)(rm[4].rm_eo - rm[4].rm_so), old_op + rm[4].rm_so);
-	} else {
-		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
-		return SDT_ARG_SKIP;
-	}
-
-	return SDT_ARG_VALID;
-}
-
-uint64_t arch__intr_reg_mask(void)
-{
-	struct perf_event_attr attr = {
-		.type                   = PERF_TYPE_HARDWARE,
-		.config                 = PERF_COUNT_HW_CPU_CYCLES,
-		.sample_type            = PERF_SAMPLE_REGS_INTR,
-		.precise_ip             = 1,
-		.disabled               = 1,
-		.exclude_kernel         = 1,
-	};
-	int fd;
-	u32 version;
-	u64 extended_mask = 0, mask = PERF_REGS_MASK;
-
-	/*
-	 * Get the PVR value to set the extended
-	 * mask specific to platform.
-	 */
-	version = (((mfspr(SPRN_PVR)) >>  16) & 0xFFFF);
-	if (version == PVR_POWER9)
-		extended_mask = PERF_REG_PMU_MASK_300;
-	else if ((version == PVR_POWER10) || (version == PVR_POWER11))
-		extended_mask = PERF_REG_PMU_MASK_31;
-	else
-		return mask;
-
-	attr.sample_regs_intr = extended_mask;
-	attr.sample_period = 1;
-	event_attr_init(&attr);
-
-	/*
-	 * check if the pmu supports perf extended regs, before
-	 * returning the register mask to sample.
-	 */
-	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
-	if (fd != -1) {
-		close(fd);
-		mask |= extended_mask;
-	}
-	return mask;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
index 356786432fd3..e57f10798fa6 100644
--- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c
+++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
@@ -30,14 +30,6 @@
  * The libdwfl code in this file is based on code from elfutils
  * (libdwfl/argp-std.c, libdwfl/tests/addrcfi.c, etc).
  */
-static char *debuginfo_path;
-
-static const Dwfl_Callbacks offline_callbacks = {
-	.debuginfo_path = &debuginfo_path,
-	.find_debuginfo = dwfl_standard_find_debuginfo,
-	.section_address = dwfl_offline_section_address,
-};
-
 
 /*
  * Use the DWARF expression for the Call-frame-address and determine
@@ -149,44 +141,22 @@ static Dwarf_Frame *get_dwarf_frame(Dwfl_Module *mod, Dwarf_Addr pc)
  *		yet used)
  *	-1 in case of errors
  */
-static int check_return_addr(struct dso *dso, u64 map_start, Dwarf_Addr pc)
+static int check_return_addr(struct dso *dso, Dwarf_Addr mapped_pc)
 {
 	int		rc = -1;
 	Dwfl		*dwfl;
 	Dwfl_Module	*mod;
 	Dwarf_Frame	*frame;
 	int		ra_regno;
-	Dwarf_Addr	start = pc;
-	Dwarf_Addr	end = pc;
+	Dwarf_Addr	start = mapped_pc;
+	Dwarf_Addr	end = mapped_pc;
 	bool		signalp;
-	const char	*exec_file = dso__long_name(dso);
-
-	dwfl = RC_CHK_ACCESS(dso)->dwfl;
-
-	if (!dwfl) {
-		dwfl = dwfl_begin(&offline_callbacks);
-		if (!dwfl) {
-			pr_debug("dwfl_begin() failed: %s\n", dwarf_errmsg(-1));
-			return -1;
-		}
-
-		mod = dwfl_report_elf(dwfl, exec_file, exec_file, -1,
-						map_start, false);
-		if (!mod) {
-			pr_debug("dwfl_report_elf() failed %s\n",
-						dwarf_errmsg(-1));
-			/*
-			 * We normally cache the DWARF debug info and never
-			 * call dwfl_end(). But to prevent fd leak, free in
-			 * case of error.
-			 */
-			dwfl_end(dwfl);
-			goto out;
-		}
-		RC_CHK_ACCESS(dso)->dwfl = dwfl;
-	}
 
-	mod = dwfl_addrmodule(dwfl, pc);
+	dwfl = dso__libdw_dwfl(dso);
+	if (!dwfl)
+		return -1;
+
+	mod = dwfl_addrmodule(dwfl, mapped_pc);
 	if (!mod) {
 		pr_debug("dwfl_addrmodule() failed, %s\n", dwarf_errmsg(-1));
 		goto out;
@@ -196,9 +166,9 @@ static int check_return_addr(struct dso *dso, u64 map_start, Dwarf_Addr pc)
 	 * To work with split debug info files (eg: glibc), check both
 	 * .eh_frame and .debug_frame sections of the ELF header.
 	 */
-	frame = get_eh_frame(mod, pc);
+	frame = get_eh_frame(mod, mapped_pc);
 	if (!frame) {
-		frame = get_dwarf_frame(mod, pc);
+		frame = get_dwarf_frame(mod, mapped_pc);
 		if (!frame)
 			goto out;
 	}
@@ -264,7 +234,7 @@ int arch_skip_callchain_idx(struct thread *thread, struct ip_callchain *chain)
 		return skip_slot;
 	}
 
-	rc = check_return_addr(dso, map__start(al.map), ip);
+	rc = check_return_addr(dso, map__map_ip(al.map, ip));
 
 	pr_debug("[DSO %s, sym %s, ip 0x%" PRIx64 "] rc %d\n",
 		dso__long_name(dso), al.sym->name, ip, rc);
diff --git a/tools/perf/arch/powerpc/util/unwind-libdw.c b/tools/perf/arch/powerpc/util/unwind-libdw.c
deleted file mode 100644
index 82d0c28ae345..000000000000
--- a/tools/perf/arch/powerpc/util/unwind-libdw.c
+++ /dev/null
@@ -1,76 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include <linux/kernel.h>
-#include "perf_regs.h"
-#include "../../../util/unwind-libdw.h"
-#include "../../../util/perf_regs.h"
-#include "../../../util/sample.h"
-
-/* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils.  */
-static const int special_regs[3][2] = {
-	{ 65, PERF_REG_POWERPC_LINK },
-	{ 101, PERF_REG_POWERPC_XER },
-	{ 109, PERF_REG_POWERPC_CTR },
-};
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[32], dwarf_nip;
-	size_t i;
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_POWERPC_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = REG(R0);
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(R11);
-	dwarf_regs[12] = REG(R12);
-	dwarf_regs[13] = REG(R13);
-	dwarf_regs[14] = REG(R14);
-	dwarf_regs[15] = REG(R15);
-	dwarf_regs[16] = REG(R16);
-	dwarf_regs[17] = REG(R17);
-	dwarf_regs[18] = REG(R18);
-	dwarf_regs[19] = REG(R19);
-	dwarf_regs[20] = REG(R20);
-	dwarf_regs[21] = REG(R21);
-	dwarf_regs[22] = REG(R22);
-	dwarf_regs[23] = REG(R23);
-	dwarf_regs[24] = REG(R24);
-	dwarf_regs[25] = REG(R25);
-	dwarf_regs[26] = REG(R26);
-	dwarf_regs[27] = REG(R27);
-	dwarf_regs[28] = REG(R28);
-	dwarf_regs[29] = REG(R29);
-	dwarf_regs[30] = REG(R30);
-	dwarf_regs[31] = REG(R31);
-	if (!dwfl_thread_state_registers(thread, 0, 32, dwarf_regs))
-		return false;
-
-	dwarf_nip = REG(NIP);
-	dwfl_thread_state_register_pc(thread, dwarf_nip);
-	for (i = 0; i < ARRAY_SIZE(special_regs); i++) {
-		Dwarf_Word val = 0;
-		perf_reg_value(&val, user_regs, special_regs[i][1]);
-		if (!dwfl_thread_state_registers(thread,
-						 special_regs[i][0], 1,
-						 &val))
-			return false;
-	}
-
-	return true;
-}
diff --git a/tools/perf/arch/riscv/Makefile b/tools/perf/arch/riscv/Makefile
index 087e099fb453..44cc3f023318 100644
--- a/tools/perf/arch/riscv/Makefile
+++ b/tools/perf/arch/riscv/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
 PERF_HAVE_JITDUMP := 1
-HAVE_KVM_STAT_SUPPORT := 1
diff --git a/tools/perf/arch/riscv/include/perf_regs.h b/tools/perf/arch/riscv/include/perf_regs.h
index d482edb413e5..af7a1b47bf66 100644
--- a/tools/perf/arch/riscv/include/perf_regs.h
+++ b/tools/perf/arch/riscv/include/perf_regs.h
@@ -6,14 +6,19 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/riscv/include/uapi/asm/perf_regs.h"
 
 #define PERF_REGS_MASK	((1ULL << PERF_REG_RISCV_MAX) - 1)
 #define PERF_REGS_MAX	PERF_REG_RISCV_MAX
+
+#if defined(__riscv_xlen)
 #if __riscv_xlen == 64
-#define PERF_SAMPLE_REGS_ABI    PERF_SAMPLE_REGS_ABI_64
+#define PERF_SAMPLE_REGS_ABI	PERF_SAMPLE_REGS_ABI_64
 #else
 #define PERF_SAMPLE_REGS_ABI	PERF_SAMPLE_REGS_ABI_32
 #endif
+#else
+#define PERF_SAMPLE_REGS_ABI	PERF_SAMPLE_REGS_NONE
+#endif
 
 #endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/riscv/util/Build b/tools/perf/arch/riscv/util/Build
index 58a672246024..2328fb9a30a3 100644
--- a/tools/perf/arch/riscv/util/Build
+++ b/tools/perf/arch/riscv/util/Build
@@ -1,5 +1 @@
-perf-util-y += perf_regs.o
 perf-util-y += header.o
-
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/riscv/util/perf_regs.c b/tools/perf/arch/riscv/util/perf_regs.c
deleted file mode 100644
index 6b1665f41180..000000000000
--- a/tools/perf/arch/riscv/util/perf_regs.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../util/perf_regs.h"
-
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/riscv/util/unwind-libdw.c b/tools/perf/arch/riscv/util/unwind-libdw.c
deleted file mode 100644
index dc1476e16321..000000000000
--- a/tools/perf/arch/riscv/util/unwind-libdw.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. */
-
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../util/unwind-libdw.h"
-#include "../../util/perf_regs.h"
-#include "../../util/sample.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[32];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_RISCV_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = 0;
-	dwarf_regs[1]  = REG(RA);
-	dwarf_regs[2]  = REG(SP);
-	dwarf_regs[3]  = REG(GP);
-	dwarf_regs[4]  = REG(TP);
-	dwarf_regs[5]  = REG(T0);
-	dwarf_regs[6]  = REG(T1);
-	dwarf_regs[7]  = REG(T2);
-	dwarf_regs[8]  = REG(S0);
-	dwarf_regs[9]  = REG(S1);
-	dwarf_regs[10] = REG(A0);
-	dwarf_regs[11] = REG(A1);
-	dwarf_regs[12] = REG(A2);
-	dwarf_regs[13] = REG(A3);
-	dwarf_regs[14] = REG(A4);
-	dwarf_regs[15] = REG(A5);
-	dwarf_regs[16] = REG(A6);
-	dwarf_regs[17] = REG(A7);
-	dwarf_regs[18] = REG(S2);
-	dwarf_regs[19] = REG(S3);
-	dwarf_regs[20] = REG(S4);
-	dwarf_regs[21] = REG(S5);
-	dwarf_regs[22] = REG(S6);
-	dwarf_regs[23] = REG(S7);
-	dwarf_regs[24] = REG(S8);
-	dwarf_regs[25] = REG(S9);
-	dwarf_regs[26] = REG(S10);
-	dwarf_regs[27] = REG(S11);
-	dwarf_regs[28] = REG(T3);
-	dwarf_regs[29] = REG(T4);
-	dwarf_regs[30] = REG(T5);
-	dwarf_regs[31] = REG(T6);
-	dwfl_thread_state_register_pc(thread, REG(PC));
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_RISCV_MAX,
-					   dwarf_regs);
-}
diff --git a/tools/perf/arch/riscv64/annotate/instructions.c b/tools/perf/arch/riscv64/annotate/instructions.c
deleted file mode 100644
index 55cf911633f8..000000000000
--- a/tools/perf/arch/riscv64/annotate/instructions.c
+++ /dev/null
@@ -1,36 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-static
-struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *name)
-{
-	struct ins_ops *ops = NULL;
-
-	if (!strncmp(name, "jal", 3) ||
-	    !strncmp(name, "jr", 2) ||
-	    !strncmp(name, "call", 4))
-		ops = &call_ops;
-	else if (!strncmp(name, "ret", 3))
-		ops = &ret_ops;
-	else if (name[0] == 'j' || name[0] == 'b')
-		ops = &jump_ops;
-	else
-		return NULL;
-
-	arch__associate_ins_ops(arch, name, ops);
-
-	return ops;
-}
-
-static
-int riscv64__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	if (!arch->initialized) {
-		arch->associate_instruction_ops = riscv64__associate_ins_ops;
-		arch->initialized = true;
-		arch->objdump.comment_char = '#';
-		arch->e_machine = EM_RISCV;
-		arch->e_flags = 0;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile
index 0033698a65ce..8b59ce8efb89 100644
--- a/tools/perf/arch/s390/Makefile
+++ b/tools/perf/arch/s390/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/s390/include/perf_regs.h b/tools/perf/arch/s390/include/perf_regs.h
index 130dfad2b96a..9c95589965fe 100644
--- a/tools/perf/arch/s390/include/perf_regs.h
+++ b/tools/perf/arch/s390/include/perf_regs.h
@@ -3,7 +3,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/s390/include/uapi/asm/perf_regs.h"
 
 void perf_regs_load(u64 *regs);
 
diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build
index c64eb18dbdae..65d75cd5b138 100644
--- a/tools/perf/arch/s390/util/Build
+++ b/tools/perf/arch/s390/util/Build
@@ -1,8 +1,4 @@
 perf-util-y += header.o
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
-perf-util-y += perf_regs.o
-
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 
 perf-util-y += machine.o
 perf-util-y += pmu.o
diff --git a/tools/perf/arch/s390/util/perf_regs.c b/tools/perf/arch/s390/util/perf_regs.c
deleted file mode 100644
index 6b1665f41180..000000000000
--- a/tools/perf/arch/s390/util/perf_regs.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../util/perf_regs.h"
-
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/s390/util/unwind-libdw.c b/tools/perf/arch/s390/util/unwind-libdw.c
deleted file mode 100644
index c27c7a0d1076..000000000000
--- a/tools/perf/arch/s390/util/unwind-libdw.c
+++ /dev/null
@@ -1,65 +0,0 @@
-#include <linux/kernel.h>
-#include <elfutils/libdwfl.h>
-#include "../../util/unwind-libdw.h"
-#include "../../util/perf_regs.h"
-#include "../../util/event.h"
-#include "../../util/sample.h"
-#include "dwarf-regs-table.h"
-#include "perf_regs.h"
-
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[ARRAY_SIZE(s390_dwarf_regs)];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_S390_##r);	\
-	val;							\
-})
-	/*
-	 * For DWARF register mapping details,
-	 * see also perf/arch/s390/include/dwarf-regs-table.h
-	 */
-	dwarf_regs[0]  = REG(R0);
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(R11);
-	dwarf_regs[12] = REG(R12);
-	dwarf_regs[13] = REG(R13);
-	dwarf_regs[14] = REG(R14);
-	dwarf_regs[15] = REG(R15);
-
-	dwarf_regs[16] = REG(FP0);
-	dwarf_regs[17] = REG(FP2);
-	dwarf_regs[18] = REG(FP4);
-	dwarf_regs[19] = REG(FP6);
-	dwarf_regs[20] = REG(FP1);
-	dwarf_regs[21] = REG(FP3);
-	dwarf_regs[22] = REG(FP5);
-	dwarf_regs[23] = REG(FP7);
-	dwarf_regs[24] = REG(FP8);
-	dwarf_regs[25] = REG(FP10);
-	dwarf_regs[26] = REG(FP12);
-	dwarf_regs[27] = REG(FP14);
-	dwarf_regs[28] = REG(FP9);
-	dwarf_regs[29] = REG(FP11);
-	dwarf_regs[30] = REG(FP13);
-	dwarf_regs[31] = REG(FP15);
-
-	dwarf_regs[64] = REG(MASK);
-	dwarf_regs[65] = REG(PC);
-
-	dwfl_thread_state_register_pc(thread, dwarf_regs[65]);
-	return dwfl_thread_state_registers(thread, 0, 32, dwarf_regs);
-}
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile
index a295a80ea078..44cc3f023318 100644
--- a/tools/perf/arch/x86/Makefile
+++ b/tools/perf/arch/x86/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
-HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/x86/include/perf_regs.h b/tools/perf/arch/x86/include/perf_regs.h
index f209ce2c1dd9..5495e5ca7cdc 100644
--- a/tools/perf/arch/x86/include/perf_regs.h
+++ b/tools/perf/arch/x86/include/perf_regs.h
@@ -4,7 +4,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/x86/include/uapi/asm/perf_regs.h"
 
 void perf_regs_load(u64 *regs);
 
diff --git a/tools/perf/arch/x86/tests/bp-modify.c b/tools/perf/arch/x86/tests/bp-modify.c
index 0924ccd9e36d..589b43273948 100644
--- a/tools/perf/arch/x86/tests/bp-modify.c
+++ b/tools/perf/arch/x86/tests/bp-modify.c
@@ -80,26 +80,24 @@ static int bp_modify1(void)
 	 */
 	if (ptrace(PTRACE_POKEUSER, child,
 		   offsetof(struct user, u_debugreg[0]), bp_2)) {
-		pr_debug("failed to set breakpoint, 1st time: %s\n",
-			 strerror(errno));
+		pr_debug("failed to set breakpoint, 1st time: %m\n");
 		goto out;
 	}
 
 	if (ptrace(PTRACE_POKEUSER, child,
 		   offsetof(struct user, u_debugreg[0]), bp_1)) {
-		pr_debug("failed to set breakpoint, 2nd time: %s\n",
-			 strerror(errno));
+		pr_debug("failed to set breakpoint, 2nd time: %m\n");
 		goto out;
 	}
 
 	if (ptrace(PTRACE_POKEUSER, child,
 		   offsetof(struct user, u_debugreg[7]), dr7)) {
-		pr_debug("failed to set dr7: %s\n", strerror(errno));
+		pr_debug("failed to set dr7: %m\n");
 		goto out;
 	}
 
 	if (ptrace(PTRACE_CONT, child, NULL, NULL)) {
-		pr_debug("failed to PTRACE_CONT: %s\n", strerror(errno));
+		pr_debug("failed to PTRACE_CONT: %m\n");
 		goto out;
 	}
 
@@ -112,19 +110,17 @@ static int bp_modify1(void)
 	rip = ptrace(PTRACE_PEEKUSER, child,
 		     offsetof(struct user_regs_struct, rip), NULL);
 	if (rip == (unsigned long) -1) {
-		pr_debug("failed to PTRACE_PEEKUSER: %s\n",
-			 strerror(errno));
+		pr_debug("failed to PTRACE_PEEKUSER: %m\n");
 		goto out;
 	}
 
 	pr_debug("rip %lx, bp_1 %p\n", rip, bp_1);
-
 out:
 	if (ptrace(PTRACE_DETACH, child, NULL, NULL)) {
-		pr_debug("failed to PTRACE_DETACH: %s", strerror(errno));
+		pr_debug("failed to PTRACE_DETACH: %m\n");
 		return TEST_FAIL;
-	}
 
+	}
 	return rip == (unsigned long) bp_1 ? TEST_OK : TEST_FAIL;
 }
 
@@ -157,14 +153,13 @@ static int bp_modify2(void)
 	 */
 	if (ptrace(PTRACE_POKEUSER, child,
 		   offsetof(struct user, u_debugreg[0]), bp_1)) {
-		pr_debug("failed to set breakpoint: %s\n",
-			 strerror(errno));
+		pr_debug("failed to set breakpoint: %m\n");
 		goto out;
 	}
 
 	if (ptrace(PTRACE_POKEUSER, child,
 		   offsetof(struct user, u_debugreg[7]), dr7)) {
-		pr_debug("failed to set dr7: %s\n", strerror(errno));
+		pr_debug("failed to set dr7: %m\n");
 		goto out;
 	}
 
@@ -175,7 +170,7 @@ static int bp_modify2(void)
 	}
 
 	if (ptrace(PTRACE_CONT, child, NULL, NULL)) {
-		pr_debug("failed to PTRACE_CONT: %s\n", strerror(errno));
+		pr_debug("failed to PTRACE_CONT: %m\n");
 		goto out;
 	}
 
@@ -188,8 +183,7 @@ static int bp_modify2(void)
 	rip = ptrace(PTRACE_PEEKUSER, child,
 		     offsetof(struct user_regs_struct, rip), NULL);
 	if (rip == (unsigned long) -1) {
-		pr_debug("failed to PTRACE_PEEKUSER: %s\n",
-			 strerror(errno));
+		pr_debug("failed to PTRACE_PEEKUSER: %m\n");
 		goto out;
 	}
 
@@ -197,7 +191,7 @@ static int bp_modify2(void)
 
 out:
 	if (ptrace(PTRACE_DETACH, child, NULL, NULL)) {
-		pr_debug("failed to PTRACE_DETACH: %s", strerror(errno));
+		pr_debug("failed to PTRACE_DETACH: %m\n");
 		return TEST_FAIL;
 	}
 
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index c0dc5965f362..b94c91984c66 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -1,8 +1,6 @@
 perf-util-y += header.o
 perf-util-y += tsc.o
 perf-util-y += pmu.o
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
-perf-util-y += perf_regs.o
 perf-util-y += topdown.o
 perf-util-y += machine.o
 perf-util-y += event.o
@@ -12,9 +10,7 @@ perf-util-y += evsel.o
 perf-util-y += iostat.o
 
 perf-util-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind.o
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 
 perf-util-y += auxtrace.o
-perf-util-y += archinsn.o
 perf-util-y += intel-pt.o
 perf-util-y += intel-bts.o
diff --git a/tools/perf/arch/x86/util/archinsn.c b/tools/perf/arch/x86/util/archinsn.c
deleted file mode 100644
index 546feda08428..000000000000
--- a/tools/perf/arch/x86/util/archinsn.c
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "archinsn.h"
-#include "event.h"
-#include "machine.h"
-#include "thread.h"
-#include "symbol.h"
-#include "../../../../arch/x86/include/asm/insn.h"
-
-void arch_fetch_insn(struct perf_sample *sample,
-		     struct thread *thread,
-		     struct machine *machine)
-{
-	struct insn insn;
-	int len, ret;
-	bool is64bit = false;
-
-	if (!sample->ip)
-		return;
-	len = thread__memcpy(thread, machine, sample->insn, sample->ip, sizeof(sample->insn), &is64bit);
-	if (len <= 0)
-		return;
-
-	ret = insn_decode(&insn, sample->insn, len,
-			  is64bit ? INSN_MODE_64 : INSN_MODE_32);
-	if (ret >= 0 && insn.length <= len)
-		sample->insn_len = insn.length;
-}
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index b394ad9cc635..c131a727774f 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -664,8 +664,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
 		return 0;
 
 	if (opts->auxtrace_sample_mode)
-		evsel__set_config_if_unset(intel_pt_pmu, intel_pt_evsel,
-					   "psb_period", 0);
+		evsel__set_config_if_unset(intel_pt_evsel, "psb_period", 0);
 
 	err = intel_pt_validate_config(intel_pt_pmu, intel_pt_evsel);
 	if (err)
diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c
deleted file mode 100644
index 12fd93f04802..000000000000
--- a/tools/perf/arch/x86/util/perf_regs.c
+++ /dev/null
@@ -1,330 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <errno.h>
-#include <string.h>
-#include <regex.h>
-#include <linux/kernel.h>
-#include <linux/zalloc.h>
-
-#include "perf_regs.h"
-#include "../../../perf-sys.h"
-#include "../../../util/perf_regs.h"
-#include "../../../util/debug.h"
-#include "../../../util/event.h"
-#include "../../../util/pmu.h"
-#include "../../../util/pmus.h"
-
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG(AX, PERF_REG_X86_AX),
-	SMPL_REG(BX, PERF_REG_X86_BX),
-	SMPL_REG(CX, PERF_REG_X86_CX),
-	SMPL_REG(DX, PERF_REG_X86_DX),
-	SMPL_REG(SI, PERF_REG_X86_SI),
-	SMPL_REG(DI, PERF_REG_X86_DI),
-	SMPL_REG(BP, PERF_REG_X86_BP),
-	SMPL_REG(SP, PERF_REG_X86_SP),
-	SMPL_REG(IP, PERF_REG_X86_IP),
-	SMPL_REG(FLAGS, PERF_REG_X86_FLAGS),
-	SMPL_REG(CS, PERF_REG_X86_CS),
-	SMPL_REG(SS, PERF_REG_X86_SS),
-#ifdef HAVE_ARCH_X86_64_SUPPORT
-	SMPL_REG(R8, PERF_REG_X86_R8),
-	SMPL_REG(R9, PERF_REG_X86_R9),
-	SMPL_REG(R10, PERF_REG_X86_R10),
-	SMPL_REG(R11, PERF_REG_X86_R11),
-	SMPL_REG(R12, PERF_REG_X86_R12),
-	SMPL_REG(R13, PERF_REG_X86_R13),
-	SMPL_REG(R14, PERF_REG_X86_R14),
-	SMPL_REG(R15, PERF_REG_X86_R15),
-#endif
-	SMPL_REG2(XMM0, PERF_REG_X86_XMM0),
-	SMPL_REG2(XMM1, PERF_REG_X86_XMM1),
-	SMPL_REG2(XMM2, PERF_REG_X86_XMM2),
-	SMPL_REG2(XMM3, PERF_REG_X86_XMM3),
-	SMPL_REG2(XMM4, PERF_REG_X86_XMM4),
-	SMPL_REG2(XMM5, PERF_REG_X86_XMM5),
-	SMPL_REG2(XMM6, PERF_REG_X86_XMM6),
-	SMPL_REG2(XMM7, PERF_REG_X86_XMM7),
-	SMPL_REG2(XMM8, PERF_REG_X86_XMM8),
-	SMPL_REG2(XMM9, PERF_REG_X86_XMM9),
-	SMPL_REG2(XMM10, PERF_REG_X86_XMM10),
-	SMPL_REG2(XMM11, PERF_REG_X86_XMM11),
-	SMPL_REG2(XMM12, PERF_REG_X86_XMM12),
-	SMPL_REG2(XMM13, PERF_REG_X86_XMM13),
-	SMPL_REG2(XMM14, PERF_REG_X86_XMM14),
-	SMPL_REG2(XMM15, PERF_REG_X86_XMM15),
-	SMPL_REG_END
-};
-
-struct sdt_name_reg {
-	const char *sdt_name;
-	const char *uprobe_name;
-};
-#define SDT_NAME_REG(n, m) {.sdt_name = "%" #n, .uprobe_name = "%" #m}
-#define SDT_NAME_REG_END {.sdt_name = NULL, .uprobe_name = NULL}
-
-static const struct sdt_name_reg sdt_reg_tbl[] = {
-	SDT_NAME_REG(eax, ax),
-	SDT_NAME_REG(rax, ax),
-	SDT_NAME_REG(al,  ax),
-	SDT_NAME_REG(ah,  ax),
-	SDT_NAME_REG(ebx, bx),
-	SDT_NAME_REG(rbx, bx),
-	SDT_NAME_REG(bl,  bx),
-	SDT_NAME_REG(bh,  bx),
-	SDT_NAME_REG(ecx, cx),
-	SDT_NAME_REG(rcx, cx),
-	SDT_NAME_REG(cl,  cx),
-	SDT_NAME_REG(ch,  cx),
-	SDT_NAME_REG(edx, dx),
-	SDT_NAME_REG(rdx, dx),
-	SDT_NAME_REG(dl,  dx),
-	SDT_NAME_REG(dh,  dx),
-	SDT_NAME_REG(esi, si),
-	SDT_NAME_REG(rsi, si),
-	SDT_NAME_REG(sil, si),
-	SDT_NAME_REG(edi, di),
-	SDT_NAME_REG(rdi, di),
-	SDT_NAME_REG(dil, di),
-	SDT_NAME_REG(ebp, bp),
-	SDT_NAME_REG(rbp, bp),
-	SDT_NAME_REG(bpl, bp),
-	SDT_NAME_REG(rsp, sp),
-	SDT_NAME_REG(esp, sp),
-	SDT_NAME_REG(spl, sp),
-
-	/* rNN registers */
-	SDT_NAME_REG(r8b,  r8),
-	SDT_NAME_REG(r8w,  r8),
-	SDT_NAME_REG(r8d,  r8),
-	SDT_NAME_REG(r9b,  r9),
-	SDT_NAME_REG(r9w,  r9),
-	SDT_NAME_REG(r9d,  r9),
-	SDT_NAME_REG(r10b, r10),
-	SDT_NAME_REG(r10w, r10),
-	SDT_NAME_REG(r10d, r10),
-	SDT_NAME_REG(r11b, r11),
-	SDT_NAME_REG(r11w, r11),
-	SDT_NAME_REG(r11d, r11),
-	SDT_NAME_REG(r12b, r12),
-	SDT_NAME_REG(r12w, r12),
-	SDT_NAME_REG(r12d, r12),
-	SDT_NAME_REG(r13b, r13),
-	SDT_NAME_REG(r13w, r13),
-	SDT_NAME_REG(r13d, r13),
-	SDT_NAME_REG(r14b, r14),
-	SDT_NAME_REG(r14w, r14),
-	SDT_NAME_REG(r14d, r14),
-	SDT_NAME_REG(r15b, r15),
-	SDT_NAME_REG(r15w, r15),
-	SDT_NAME_REG(r15d, r15),
-	SDT_NAME_REG_END,
-};
-
-/*
- * Perf only supports OP which is in  +/-NUM(REG)  form.
- * Here plus-minus sign, NUM and parenthesis are optional,
- * only REG is mandatory.
- *
- * SDT events also supports indirect addressing mode with a
- * symbol as offset, scaled mode and constants in OP. But
- * perf does not support them yet. Below are few examples.
- *
- * OP with scaled mode:
- *     (%rax,%rsi,8)
- *     10(%ras,%rsi,8)
- *
- * OP with indirect addressing mode:
- *     check_action(%rip)
- *     mp_+52(%rip)
- *     44+mp_(%rip)
- *
- * OP with constant values:
- *     $0
- *     $123
- *     $-1
- */
-#define SDT_OP_REGEX  "^([+\\-]?)([0-9]*)(\\(?)(%[a-z][a-z0-9]+)(\\)?)$"
-
-static regex_t sdt_op_regex;
-
-static int sdt_init_op_regex(void)
-{
-	static int initialized;
-	int ret = 0;
-
-	if (initialized)
-		return 0;
-
-	ret = regcomp(&sdt_op_regex, SDT_OP_REGEX, REG_EXTENDED);
-	if (ret < 0) {
-		pr_debug4("Regex compilation error.\n");
-		return ret;
-	}
-
-	initialized = 1;
-	return 0;
-}
-
-/*
- * Max x86 register name length is 5(ex: %r15d). So, 6th char
- * should always contain NULL. This helps to find register name
- * length using strlen, instead of maintaining one more variable.
- */
-#define SDT_REG_NAME_SIZE  6
-
-/*
- * The uprobe parser does not support all gas register names;
- * so, we have to replace them (ex. for x86_64: %rax -> %ax).
- * Note: If register does not require renaming, just copy
- * paste as it is, but don't leave it empty.
- */
-static void sdt_rename_register(char *sdt_reg, int sdt_len, char *uprobe_reg)
-{
-	int i = 0;
-
-	for (i = 0; sdt_reg_tbl[i].sdt_name != NULL; i++) {
-		if (!strncmp(sdt_reg_tbl[i].sdt_name, sdt_reg, sdt_len)) {
-			strcpy(uprobe_reg, sdt_reg_tbl[i].uprobe_name);
-			return;
-		}
-	}
-
-	strncpy(uprobe_reg, sdt_reg, sdt_len);
-}
-
-int arch_sdt_arg_parse_op(char *old_op, char **new_op)
-{
-	char new_reg[SDT_REG_NAME_SIZE] = {0};
-	int new_len = 0, ret;
-	/*
-	 * rm[0]:  +/-NUM(REG)
-	 * rm[1]:  +/-
-	 * rm[2]:  NUM
-	 * rm[3]:  (
-	 * rm[4]:  REG
-	 * rm[5]:  )
-	 */
-	regmatch_t rm[6];
-	/*
-	 * Max prefix length is 2 as it may contains sign(+/-)
-	 * and displacement 0 (Both sign and displacement 0 are
-	 * optional so it may be empty). Use one more character
-	 * to hold last NULL so that strlen can be used to find
-	 * prefix length, instead of maintaining one more variable.
-	 */
-	char prefix[3] = {0};
-
-	ret = sdt_init_op_regex();
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * If unsupported OR does not match with regex OR
-	 * register name too long, skip it.
-	 */
-	if (strchr(old_op, ',') || strchr(old_op, '$') ||
-	    regexec(&sdt_op_regex, old_op, 6, rm, 0)   ||
-	    rm[4].rm_eo - rm[4].rm_so > SDT_REG_NAME_SIZE) {
-		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
-		return SDT_ARG_SKIP;
-	}
-
-	/*
-	 * Prepare prefix.
-	 * If SDT OP has parenthesis but does not provide
-	 * displacement, add 0 for displacement.
-	 *     SDT         Uprobe     Prefix
-	 *     -----------------------------
-	 *     +24(%rdi)   +24(%di)   +
-	 *     24(%rdi)    +24(%di)   +
-	 *     %rdi        %di
-	 *     (%rdi)      +0(%di)    +0
-	 *     -80(%rbx)   -80(%bx)   -
-	 */
-	if (rm[3].rm_so != rm[3].rm_eo) {
-		if (rm[1].rm_so != rm[1].rm_eo)
-			prefix[0] = *(old_op + rm[1].rm_so);
-		else if (rm[2].rm_so != rm[2].rm_eo)
-			prefix[0] = '+';
-		else
-			scnprintf(prefix, sizeof(prefix), "+0");
-	}
-
-	/* Rename register */
-	sdt_rename_register(old_op + rm[4].rm_so, rm[4].rm_eo - rm[4].rm_so,
-			    new_reg);
-
-	/* Prepare final OP which should be valid for uprobe_events */
-	new_len = strlen(prefix)              +
-		  (rm[2].rm_eo - rm[2].rm_so) +
-		  (rm[3].rm_eo - rm[3].rm_so) +
-		  strlen(new_reg)             +
-		  (rm[5].rm_eo - rm[5].rm_so) +
-		  1;					/* NULL */
-
-	*new_op = zalloc(new_len);
-	if (!*new_op)
-		return -ENOMEM;
-
-	scnprintf(*new_op, new_len, "%.*s%.*s%.*s%.*s%.*s",
-		  strlen(prefix), prefix,
-		  (int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
-		  (int)(rm[3].rm_eo - rm[3].rm_so), old_op + rm[3].rm_so,
-		  strlen(new_reg), new_reg,
-		  (int)(rm[5].rm_eo - rm[5].rm_so), old_op + rm[5].rm_so);
-
-	return SDT_ARG_VALID;
-}
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
-
-uint64_t arch__intr_reg_mask(void)
-{
-	struct perf_event_attr attr = {
-		.type			= PERF_TYPE_HARDWARE,
-		.config			= PERF_COUNT_HW_CPU_CYCLES,
-		.sample_type		= PERF_SAMPLE_REGS_INTR,
-		.sample_regs_intr	= PERF_REG_EXTENDED_MASK,
-		.precise_ip		= 1,
-		.disabled 		= 1,
-		.exclude_kernel		= 1,
-	};
-	int fd;
-	/*
-	 * In an unnamed union, init it here to build on older gcc versions
-	 */
-	attr.sample_period = 1;
-
-	if (perf_pmus__num_core_pmus() > 1) {
-		struct perf_pmu *pmu = NULL;
-		__u64 type = PERF_TYPE_RAW;
-
-		/*
-		 * The same register set is supported among different hybrid PMUs.
-		 * Only check the first available one.
-		 */
-		while ((pmu = perf_pmus__scan_core(pmu)) != NULL) {
-			type = pmu->type;
-			break;
-		}
-		attr.config |= type << PERF_PMU_TYPE_SHIFT;
-	}
-
-	event_attr_init(&attr);
-
-	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
-	if (fd != -1) {
-		close(fd);
-		return (PERF_REG_EXTENDED_MASK | PERF_REGS_MASK);
-	}
-
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
diff --git a/tools/perf/arch/x86/util/unwind-libdw.c b/tools/perf/arch/x86/util/unwind-libdw.c
deleted file mode 100644
index 798493e887d7..000000000000
--- a/tools/perf/arch/x86/util/unwind-libdw.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../../util/unwind-libdw.h"
-#include "../../../util/perf_regs.h"
-#include "util/sample.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[17];
-	unsigned nregs;
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_X86_##r);	\
-	val;							\
-})
-
-	if (user_regs->abi == PERF_SAMPLE_REGS_ABI_32) {
-		dwarf_regs[0] = REG(AX);
-		dwarf_regs[1] = REG(CX);
-		dwarf_regs[2] = REG(DX);
-		dwarf_regs[3] = REG(BX);
-		dwarf_regs[4] = REG(SP);
-		dwarf_regs[5] = REG(BP);
-		dwarf_regs[6] = REG(SI);
-		dwarf_regs[7] = REG(DI);
-		dwarf_regs[8] = REG(IP);
-		nregs = 9;
-	} else {
-		dwarf_regs[0]  = REG(AX);
-		dwarf_regs[1]  = REG(DX);
-		dwarf_regs[2]  = REG(CX);
-		dwarf_regs[3]  = REG(BX);
-		dwarf_regs[4]  = REG(SI);
-		dwarf_regs[5]  = REG(DI);
-		dwarf_regs[6]  = REG(BP);
-		dwarf_regs[7]  = REG(SP);
-		dwarf_regs[8]  = REG(R8);
-		dwarf_regs[9]  = REG(R9);
-		dwarf_regs[10] = REG(R10);
-		dwarf_regs[11] = REG(R11);
-		dwarf_regs[12] = REG(R12);
-		dwarf_regs[13] = REG(R13);
-		dwarf_regs[14] = REG(R14);
-		dwarf_regs[15] = REG(R15);
-		dwarf_regs[16] = REG(IP);
-		nregs = 17;
-	}
-
-	return dwfl_thread_state_registers(thread, 0, nregs, dwarf_regs);
-}
diff --git a/tools/perf/bench/uprobe.c b/tools/perf/bench/uprobe.c
index 0b90275862e1..c4dac868f1ee 100644
--- a/tools/perf/bench/uprobe.c
+++ b/tools/perf/bench/uprobe.c
@@ -54,7 +54,7 @@ static const char * const bench_uprobe_usage[] = {
 							   /*opts=*/&uprobe_opts); \
 	if (!skel->links.prog) { \
 		err = -errno; \
-		fprintf(stderr, "Failed to attach bench uprobe \"%s\": %s\n", #prog, strerror(errno)); \
+		fprintf(stderr, "Failed to attach bench uprobe \"%s\": %m\n", #prog); \
 		goto cleanup; \
 	}
 
diff --git a/tools/perf/builtin-check.c b/tools/perf/builtin-check.c
index d19769a8f689..3641d263b345 100644
--- a/tools/perf/builtin-check.c
+++ b/tools/perf/builtin-check.c
@@ -43,6 +43,7 @@ struct feature_status supported_features[] = {
 	FEATURE_STATUS("dwarf_getlocations", HAVE_LIBDW_SUPPORT),
 	FEATURE_STATUS("dwarf-unwind", HAVE_DWARF_UNWIND_SUPPORT),
 	FEATURE_STATUS_TIP("libbfd", HAVE_LIBBFD_SUPPORT, "Deprecated, license incompatibility, use BUILD_NONDISTRO=1 and install binutils-dev[el]"),
+	FEATURE_STATUS("libbabeltrace", HAVE_LIBBABELTRACE_SUPPORT),
 	FEATURE_STATUS("libbpf-strings", HAVE_LIBBPF_STRINGS_SUPPORT),
 	FEATURE_STATUS("libcapstone", HAVE_LIBCAPSTONE_SUPPORT),
 	FEATURE_STATUS("libdw-dwarf-unwind", HAVE_LIBDW_SUPPORT),
@@ -60,6 +61,7 @@ struct feature_status supported_features[] = {
 	FEATURE_STATUS("numa_num_possible_cpus", HAVE_LIBNUMA_SUPPORT),
 	FEATURE_STATUS("zlib", HAVE_ZLIB_SUPPORT),
 	FEATURE_STATUS("zstd", HAVE_ZSTD_SUPPORT),
+	FEATURE_STATUS("rust", HAVE_RUST_SUPPORT),
 
 	/* this should remain at end, to know the array end */
 	FEATURE_STATUS(NULL, _)
diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c
index f0568431fbd5..33473e071392 100644
--- a/tools/perf/builtin-daemon.c
+++ b/tools/perf/builtin-daemon.c
@@ -265,8 +265,7 @@ static int check_base(struct daemon *daemon)
 			       daemon->base);
 			return -EACCES;
 		default:
-			pr_err("failed: can't access base '%s': %s\n",
-			       daemon->base, strerror(errno));
+			pr_err("failed: can't access base '%s': %m\n", daemon->base);
 			return -errno;
 		}
 	}
@@ -544,8 +543,7 @@ static int daemon_session__control(struct daemon_session *session,
 
 	err = writen(control, msg, len);
 	if (err != len) {
-		pr_err("failed: write to control pipe: %d (%s)\n",
-		       errno, control_path);
+		pr_err("failed: write to control pipe: %m (%s)\n", control_path);
 		goto out;
 	}
 
@@ -586,7 +584,7 @@ static int setup_server_socket(struct daemon *daemon)
 	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 
 	if (fd < 0) {
-		fprintf(stderr, "socket: %s\n", strerror(errno));
+		fprintf(stderr, "socket: %m\n");
 		return -1;
 	}
 
diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c
index ce51cbf6dc97..85f59886b5cf 100644
--- a/tools/perf/builtin-data.c
+++ b/tools/perf/builtin-data.c
@@ -33,6 +33,7 @@ const char *to_ctf;
 struct perf_data_convert_opts opts = {
 	.force = false,
 	.all = false,
+	.time_str = NULL,
 };
 
 const struct option data_options[] = {
@@ -45,6 +46,8 @@ const struct option data_options[] = {
 #endif
 		OPT_BOOLEAN('f', "force", &opts.force, "don't complain, do it"),
 		OPT_BOOLEAN(0, "all", &opts.all, "Convert all events"),
+		OPT_STRING(0, "time", &opts.time_str, "str",
+			   "Time span of interest (start,stop)"),
 		OPT_END()
 	};
 
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 53d5ea4a6a4f..59bf1f72d12e 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -178,10 +178,9 @@ static struct header_column {
 	}
 };
 
-static int setup_compute_opt_wdiff(char *opt)
+static int setup_compute_opt_wdiff(const char *opt)
 {
-	char *w1_str = opt;
-	char *w2_str;
+	const char *w1_str = opt, *w2_str;
 
 	int ret = -EINVAL;
 
@@ -192,8 +191,7 @@ static int setup_compute_opt_wdiff(char *opt)
 	if (!w2_str)
 		goto out;
 
-	*w2_str++ = 0x0;
-	if (!*w2_str)
+	if (!*++w2_str)
 		goto out;
 
 	compute_wdiff_w1 = strtol(w1_str, NULL, 10);
@@ -214,7 +212,7 @@ static int setup_compute_opt_wdiff(char *opt)
 	return ret;
 }
 
-static int setup_compute_opt(char *opt)
+static int setup_compute_opt(const char *opt)
 {
 	if (compute == COMPUTE_WEIGHTED_DIFF)
 		return setup_compute_opt_wdiff(opt);
@@ -234,7 +232,7 @@ static int setup_compute(const struct option *opt, const char *str,
 	char *cstr = (char *) str;
 	char buf[50];
 	unsigned i;
-	char *option;
+	const char *option;
 
 	if (!str) {
 		*cp = COMPUTE_DELTA;
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 7be6fb6df595..2692b2e40a23 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -9,7 +9,6 @@
 #include "util/strbuf.h"
 #include "builtin.h"
 #include <subcmd/exec-cmd.h>
-#include "common-cmds.h"
 #include <subcmd/parse-options.h>
 #include <subcmd/run-command.h>
 #include <subcmd/help.h>
@@ -301,16 +300,58 @@ static struct cmdnames main_cmds, other_cmds;
 
 void list_common_cmds_help(void)
 {
-	unsigned int i, longest = 0;
+	const struct cmdname_help {
+		const char *name;
+		const char *help;
+	} common_cmds[] = {
+		{"annotate", "Read perf.data (created by perf record) and display annotated code"},
+		{"archive",
+		 "Create archive with object files with build-ids found in perf.data file"},
+		{"bench", "General framework for benchmark suites"},
+		{"buildid-cache", "Manage build-id cache."},
+		{"buildid-list", "List the buildids in a perf.data file"},
+		{"c2c", "Shared Data C2C/HITM Analyzer."},
+		{"config", "Get and set variables in a configuration file."},
+		{"daemon", "Run record sessions on background"},
+		{"data", "Data file related processing"},
+		{"diff", "Read perf.data files and display the differential profile"},
+		{"evlist", "List the event names in a perf.data file"},
+		{"ftrace", "simple wrapper for kernel's ftrace functionality"},
+		{"inject", "Filter to augment the events stream with additional information"},
+		{"iostat", "Show I/O performance metrics"},
+		{"kallsyms", "Searches running kernel for symbols"},
+		{"kvm", "Tool to trace/measure kvm guest os"},
+		{"list", "List all symbolic event types"},
+		{"mem", "Profile memory accesses"},
+		{"record", "Run a command and record its profile into perf.data"},
+		{"report", "Read perf.data (created by perf record) and display the profile"},
+		{"script", "Read perf.data (created by perf record) and display trace output"},
+		{"stat", "Run a command and gather performance counter statistics"},
+		{"test", "Runs sanity tests."},
+		{"top", "System profiling tool."},
+		{"version", "display the version of perf binary"},
+	#ifdef HAVE_LIBELF_SUPPORT
+		{"probe", "Define new dynamic tracepoints"},
+	#endif /* HAVE_LIBELF_SUPPORT */
+	#ifdef HAVE_LIBTRACEEVENT
+		{"trace", "strace inspired tool"},
+		{"kmem", "Tool to trace/measure kernel memory properties"},
+		{"kwork", "Tool to trace/measure kernel work properties (latencies)"},
+		{"lock", "Analyze lock events"},
+		{"sched", "Tool to trace/measure scheduler properties (latencies)"},
+		{"timechart", "Tool to visualize total system behavior during a workload"},
+	#endif /* HAVE_LIBTRACEEVENT */
+	};
+	size_t longest = 0;
 
-	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+	for (size_t i = 0; i < ARRAY_SIZE(common_cmds); i++) {
 		if (longest < strlen(common_cmds[i].name))
 			longest = strlen(common_cmds[i].name);
 	}
 
 	puts(" The most commonly used perf commands are:");
-	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
-		printf("   %-*s   ", longest, common_cmds[i].name);
+	for (size_t i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		printf("   %-*s   ", (int)longest, common_cmds[i].name);
 		puts(common_cmds[i].help);
 	}
 }
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index aa7be4fb5838..5b29f4296861 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -122,6 +122,7 @@ struct perf_inject {
 	bool			in_place_update;
 	bool			in_place_update_dry_run;
 	bool			copy_kcore_dir;
+	bool			convert_callchain;
 	const char		*input_name;
 	struct perf_data	output;
 	u64			bytes_written;
@@ -133,6 +134,7 @@ struct perf_inject {
 	struct guest_session	guest_session;
 	struct strlist		*known_build_ids;
 	const struct evsel	*mmap_evsel;
+	struct ip_callchain	*raw_callchain;
 };
 
 struct event_entry {
@@ -383,6 +385,90 @@ static int perf_event__repipe_sample(const struct perf_tool *tool,
 	return perf_event__repipe_synth(tool, event);
 }
 
+static int perf_event__convert_sample_callchain(const struct perf_tool *tool,
+						union perf_event *event,
+						struct perf_sample *sample,
+						struct evsel *evsel,
+						struct machine *machine)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+	struct callchain_cursor *cursor = get_tls_callchain_cursor();
+	union perf_event *event_copy = (void *)inject->event_copy;
+	struct callchain_cursor_node *node;
+	struct thread *thread;
+	u64 sample_type = evsel->core.attr.sample_type;
+	u32 sample_size = event->header.size;
+	u64 i, k;
+	int ret;
+
+	if (event_copy == NULL) {
+		inject->event_copy = malloc(PERF_SAMPLE_MAX_SIZE);
+		if (!inject->event_copy)
+			return -ENOMEM;
+
+		event_copy = (void *)inject->event_copy;
+	}
+
+	if (cursor == NULL)
+		return -ENOMEM;
+
+	callchain_cursor_reset(cursor);
+
+	thread = machine__find_thread(machine, sample->tid, sample->pid);
+	if (thread == NULL)
+		goto out;
+
+	/* this will parse DWARF using stack and register data */
+	ret = thread__resolve_callchain(thread, cursor, evsel, sample,
+					/*parent=*/NULL, /*root_al=*/NULL,
+					PERF_MAX_STACK_DEPTH);
+	thread__put(thread);
+	if (ret != 0)
+		goto out;
+
+	/* copy kernel callchain and context entries */
+	for (i = 0; i < sample->callchain->nr; i++) {
+		inject->raw_callchain->ips[i] = sample->callchain->ips[i];
+		if (sample->callchain->ips[i] == PERF_CONTEXT_USER) {
+			i++;
+			break;
+		}
+	}
+	if (i == 0 || inject->raw_callchain->ips[i - 1] != PERF_CONTEXT_USER)
+		inject->raw_callchain->ips[i++] = PERF_CONTEXT_USER;
+
+	node = cursor->first;
+	for (k = 0; k < cursor->nr && i < PERF_MAX_STACK_DEPTH; k++) {
+		if (machine__kernel_ip(machine, node->ip))
+			/* kernel IPs were added already */;
+		else if (node->ms.sym && node->ms.sym->inlined)
+			/* we can't handle inlined callchains */;
+		else
+			inject->raw_callchain->ips[i++] = node->ip;
+
+		node = node->next;
+	}
+
+	inject->raw_callchain->nr = i;
+	sample->callchain = inject->raw_callchain;
+
+out:
+	memcpy(event_copy, event, sizeof(event->header));
+
+	/* adjust sample size for stack and regs */
+	sample_size -= sample->user_stack.size;
+	sample_size -= (hweight64(evsel->core.attr.sample_regs_user) + 1) * sizeof(u64);
+	sample_size += (sample->callchain->nr + 1) * sizeof(u64);
+	event_copy->header.size = sample_size;
+
+	/* remove sample_type {STACK,REGS}_USER for synthesize */
+	sample_type &= ~(PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER);
+
+	perf_event__synthesize_sample(event_copy, sample_type,
+				      evsel->core.attr.read_format, sample);
+	return perf_event__repipe_synth(tool, event_copy);
+}
+
 static struct dso *findnew_dso(int pid, int tid, const char *filename,
 			       const struct dso_id *id, struct machine *machine)
 {
@@ -2022,7 +2108,7 @@ static int save_section_info(struct perf_inject *inject)
 	return perf_header__process_sections(header, fd, inject, save_section_info_cb);
 }
 
-static bool keep_feat(int feat)
+static bool keep_feat(struct perf_inject *inject, int feat)
 {
 	switch (feat) {
 	/* Keep original information that describes the machine or software */
@@ -2047,9 +2133,11 @@ static bool keep_feat(int feat)
 	case HEADER_CLOCK_DATA:
 	case HEADER_HYBRID_TOPOLOGY:
 	case HEADER_PMU_CAPS:
+	case HEADER_CPU_DOMAIN_INFO:
 		return true;
 	/* Information that can be updated */
 	case HEADER_BUILD_ID:
+		return inject->build_id_style == BID_RWS__NONE;
 	case HEADER_CMDLINE:
 	case HEADER_EVENT_DESC:
 	case HEADER_BRANCH_STACK:
@@ -2108,7 +2196,7 @@ static int feat_copy_cb(struct feat_copier *fc, int feat, struct feat_writer *fw
 	int ret;
 
 	if (!inject->secs[feat].offset ||
-	    !keep_feat(feat))
+	    !keep_feat(inject, feat))
 		return 0;
 
 	ret = feat_copy(inject, feat, fw);
@@ -2269,6 +2357,15 @@ static int __cmd_inject(struct perf_inject *inject)
 		/* Allow space in the header for guest attributes */
 		output_data_offset += gs->session->header.data_offset;
 		output_data_offset = roundup(output_data_offset, 4096);
+	} else if (inject->convert_callchain) {
+		inject->tool.sample	= perf_event__convert_sample_callchain;
+		inject->tool.fork	= perf_event__repipe_fork;
+		inject->tool.comm	= perf_event__repipe_comm;
+		inject->tool.exit	= perf_event__repipe_exit;
+		inject->tool.mmap	= perf_event__repipe_mmap;
+		inject->tool.mmap2	= perf_event__repipe_mmap2;
+		inject->tool.ordered_events = true;
+		inject->tool.ordering_requires_timestamps = true;
 	}
 
 	if (!inject->itrace_synth_opts.set)
@@ -2321,6 +2418,23 @@ static int __cmd_inject(struct perf_inject *inject)
 				perf_header__set_feat(&session->header,
 						      HEADER_BRANCH_STACK);
 		}
+
+		/*
+		 * The converted data file won't have stack and registers.
+		 * Update the perf_event_attr to remove them before writing.
+		 */
+		if (inject->convert_callchain) {
+			struct evsel *evsel;
+
+			evlist__for_each_entry(session->evlist, evsel) {
+				evsel__reset_sample_bit(evsel, REGS_USER);
+				evsel__reset_sample_bit(evsel, STACK_USER);
+				evsel->core.attr.sample_regs_user = 0;
+				evsel->core.attr.sample_stack_user = 0;
+				evsel->core.attr.exclude_callchain_user = 0;
+			}
+		}
+
 		session->header.data_offset = output_data_offset;
 		session->header.data_size = inject->bytes_written;
 		perf_session__inject_header(session, session->evlist, fd, &inj_fc.fc,
@@ -2345,6 +2459,18 @@ static int __cmd_inject(struct perf_inject *inject)
 	return ret;
 }
 
+static bool evsel__has_dwarf_callchain(struct evsel *evsel)
+{
+	struct perf_event_attr *attr = &evsel->core.attr;
+	const u64 dwarf_callchain_flags =
+		PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER | PERF_SAMPLE_CALLCHAIN;
+
+	if (!attr->exclude_callchain_user)
+		return false;
+
+	return (attr->sample_type & dwarf_callchain_flags) == dwarf_callchain_flags;
+}
+
 int cmd_inject(int argc, const char **argv)
 {
 	struct perf_inject inject = {
@@ -2413,6 +2539,8 @@ int cmd_inject(int argc, const char **argv)
 		OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory",
 			   "guest mount directory under which every guest os"
 			   " instance has a subdir"),
+		OPT_BOOLEAN(0, "convert-callchain", &inject.convert_callchain,
+			    "Generate callchains using DWARF and drop register/stack data"),
 		OPT_END()
 	};
 	const char * const inject_usage[] = {
@@ -2429,6 +2557,9 @@ int cmd_inject(int argc, const char **argv)
 #ifndef HAVE_JITDUMP
 	set_option_nobuild(options, 'j', "jit", "NO_LIBELF=1", true);
 #endif
+#ifndef HAVE_LIBDW_SUPPORT
+	set_option_nobuild(options, 0, "convert-callchain", "NO_LIBDW=1", true);
+#endif
 	argc = parse_options(argc, argv, options, inject_usage, 0);
 
 	/*
@@ -2526,6 +2657,8 @@ int cmd_inject(int argc, const char **argv)
 	inject.tool.compressed		= perf_event__repipe_op4_synth;
 	inject.tool.auxtrace		= perf_event__repipe_auxtrace;
 	inject.tool.bpf_metadata	= perf_event__repipe_op2_synth;
+	inject.tool.schedstat_cpu	= perf_event__repipe_op2_synth;
+	inject.tool.schedstat_domain	= perf_event__repipe_op2_synth;
 	inject.tool.dont_split_sample_group = true;
 	inject.tool.merge_deferred_callchains = false;
 	inject.session = __perf_session__new(&data, &inject.tool,
@@ -2587,6 +2720,28 @@ int cmd_inject(int argc, const char **argv)
 		}
 	}
 
+	if (inject.convert_callchain) {
+		struct evsel *evsel;
+
+		if (inject.output.is_pipe || inject.session->data->is_pipe) {
+			pr_err("--convert-callchain cannot work with pipe\n");
+			goto out_delete;
+		}
+
+		evlist__for_each_entry(inject.session->evlist, evsel) {
+			if (!evsel__has_dwarf_callchain(evsel) && !evsel__is_dummy_event(evsel)) {
+				pr_err("--convert-callchain requires DWARF call graph.\n");
+				goto out_delete;
+			}
+		}
+
+		inject.raw_callchain = calloc(PERF_MAX_STACK_DEPTH, sizeof(u64));
+		if (inject.raw_callchain == NULL) {
+			pr_err("callchain allocation failed\n");
+			goto out_delete;
+		}
+	}
+
 #ifdef HAVE_JITDUMP
 	if (inject.jit_mode) {
 		inject.tool.mmap2	   = perf_event__repipe_mmap2;
@@ -2617,5 +2772,6 @@ out_close_output:
 	free(inject.itrace_synth_opts.vm_tm_corr_args);
 	free(inject.event_copy);
 	free(inject.guest_session.ev.event_buf);
+	free(inject.raw_callchain);
 	return ret;
 }
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index c61369d54dd9..0c5e6b3aac74 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -2,6 +2,7 @@
 #include "builtin.h"
 #include "perf.h"
 
+#include <dwarf-regs.h>
 #include "util/build-id.h"
 #include "util/evsel.h"
 #include "util/evlist.h"
@@ -52,7 +53,7 @@
 #include <math.h>
 #include <perf/mmap.h>
 
-#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#if defined(HAVE_LIBTRACEEVENT)
 #define GET_EVENT_KEY(func, field)					\
 static u64 get_event_ ##func(struct kvm_event *event, int vcpu)		\
 {									\
@@ -597,7 +598,7 @@ static void kvm_display(struct perf_kvm_stat *kvm)
 
 #endif /* HAVE_SLANG_SUPPORT */
 
-#endif // defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#endif // defined(HAVE_LIBTRACEEVENT)
 
 static const char *get_filename_for_perf_kvm(void)
 {
@@ -613,13 +614,13 @@ static const char *get_filename_for_perf_kvm(void)
 	return filename;
 }
 
-#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#if defined(HAVE_LIBTRACEEVENT)
 
-static bool register_kvm_events_ops(struct perf_kvm_stat *kvm)
+static bool register_kvm_events_ops(struct perf_kvm_stat *kvm, uint16_t e_machine)
 {
-	struct kvm_reg_events_ops *events_ops = kvm_reg_events_ops;
+	const struct kvm_reg_events_ops *events_ops;
 
-	for (events_ops = kvm_reg_events_ops; events_ops->name; events_ops++) {
+	for (events_ops = kvm_reg_events_ops(e_machine); events_ops->name; events_ops++) {
 		if (!strcmp(events_ops->name, kvm->report_event)) {
 			kvm->events_ops = events_ops->ops;
 			return true;
@@ -809,7 +810,7 @@ static bool is_child_event(struct perf_kvm_stat *kvm,
 			   struct perf_sample *sample,
 			   struct event_key *key)
 {
-	struct child_event_ops *child_ops;
+	const struct child_event_ops *child_ops;
 
 	child_ops = kvm->events_ops->child_ops;
 
@@ -841,11 +842,11 @@ static bool handle_child_event(struct perf_kvm_stat *kvm,
 	return true;
 }
 
-static bool skip_event(const char *event)
+static bool skip_event(uint16_t e_machine, const char *event)
 {
 	const char * const *skip_events;
 
-	for (skip_events = kvm_skip_events; *skip_events; skip_events++)
+	for (skip_events = kvm_skip_events(e_machine); *skip_events; skip_events++)
 		if (!strcmp(event, *skip_events))
 			return true;
 
@@ -901,9 +902,10 @@ static bool handle_end_event(struct perf_kvm_stat *kvm,
 
 	if (kvm->duration && time_diff > kvm->duration) {
 		char decode[KVM_EVENT_NAME_LEN];
+		uint16_t e_machine = perf_session__e_machine(kvm->session, /*e_flags=*/NULL);
 
 		kvm->events_ops->decode_key(kvm, &event->key, decode);
-		if (!skip_event(decode)) {
+		if (!skip_event(e_machine, decode)) {
 			pr_info("%" PRIu64 " VM %d, vcpu %d: %s event took %" PRIu64 "usec\n",
 				 sample->time, sample->pid, vcpu_record->vcpu_id,
 				 decode, time_diff / NSEC_PER_USEC);
@@ -921,6 +923,8 @@ struct vcpu_event_record *per_vcpu_record(struct thread *thread,
 	/* Only kvm_entry records vcpu id. */
 	if (!thread__priv(thread) && kvm_entry_event(evsel)) {
 		struct vcpu_event_record *vcpu_record;
+		struct machine *machine = maps__machine(thread__maps(thread));
+		uint16_t e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL);
 
 		vcpu_record = zalloc(sizeof(*vcpu_record));
 		if (!vcpu_record) {
@@ -928,7 +932,7 @@ struct vcpu_event_record *per_vcpu_record(struct thread *thread,
 			return NULL;
 		}
 
-		vcpu_record->vcpu_id = evsel__intval(evsel, sample, vcpu_id_str);
+		vcpu_record->vcpu_id = evsel__intval(evsel, sample, vcpu_id_str(e_machine));
 		thread__set_priv(thread, vcpu_record);
 	}
 
@@ -1163,6 +1167,7 @@ static int cpu_isa_config(struct perf_kvm_stat *kvm)
 {
 	char buf[128], *cpuid;
 	int err;
+	uint16_t e_machine;
 
 	if (kvm->live) {
 		struct perf_cpu cpu = {-1};
@@ -1182,7 +1187,8 @@ static int cpu_isa_config(struct perf_kvm_stat *kvm)
 		return -EINVAL;
 	}
 
-	err = cpu_isa_init(kvm, cpuid);
+	e_machine = perf_session__e_machine(kvm->session, /*e_flags=*/NULL);
+	err = cpu_isa_init(kvm, e_machine, cpuid);
 	if (err == -ENOTSUP)
 		pr_err("CPU %s is not supported.\n", cpuid);
 
@@ -1413,7 +1419,7 @@ static int kvm_events_live_report(struct perf_kvm_stat *kvm)
 
 	if (!verify_vcpu(kvm->trace_vcpu) ||
 	    !is_valid_key(kvm) ||
-	    !register_kvm_events_ops(kvm)) {
+		!register_kvm_events_ops(kvm, EM_HOST)) {
 		goto out;
 	}
 
@@ -1543,7 +1549,7 @@ out:
 static int read_events(struct perf_kvm_stat *kvm)
 {
 	int ret;
-
+	uint16_t e_machine;
 	struct perf_data file = {
 		.path  = kvm->file_name,
 		.mode  = PERF_DATA_MODE_READ,
@@ -1568,6 +1574,12 @@ static int read_events(struct perf_kvm_stat *kvm)
 		goto out_delete;
 	}
 
+	e_machine = perf_session__e_machine(kvm->session, /*e_flags=*/NULL);
+	if (!register_kvm_events_ops(kvm, e_machine)) {
+		ret = -EINVAL;
+		goto out_delete;
+	}
+
 	/*
 	 * Do not use 'isa' recorded in kvm_exit tracepoint since it is not
 	 * traced in the old kernel.
@@ -1610,9 +1622,6 @@ static int kvm_events_report_vcpu(struct perf_kvm_stat *kvm)
 	if (!is_valid_key(kvm))
 		goto exit;
 
-	if (!register_kvm_events_ops(kvm))
-		goto exit;
-
 	if (kvm->use_stdio) {
 		use_browser = 0;
 		setup_pager();
@@ -1636,11 +1645,6 @@ exit:
 	return ret;
 }
 
-int __weak setup_kvm_events_tp(struct perf_kvm_stat *kvm __maybe_unused)
-{
-	return 0;
-}
-
 static int
 kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
 {
@@ -1658,15 +1662,16 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
 	};
 	const char * const *events_tp;
 	int ret;
+	uint16_t e_machine = EM_HOST;
 
 	events_tp_size = 0;
-	ret = setup_kvm_events_tp(kvm);
+	ret = setup_kvm_events_tp(kvm, e_machine);
 	if (ret < 0) {
 		pr_err("Unable to setup the kvm tracepoints\n");
 		return ret;
 	}
 
-	for (events_tp = kvm_events_tp; *events_tp; events_tp++)
+	for (events_tp = kvm_events_tp(e_machine); *events_tp; events_tp++)
 		events_tp_size++;
 
 	rec_argc = ARRAY_SIZE(record_args) + argc + 2 +
@@ -1681,7 +1686,7 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
 
 	for (j = 0; j < events_tp_size; j++) {
 		rec_argv[i++] = STRDUP_FAIL_EXIT("-e");
-		rec_argv[i++] = STRDUP_FAIL_EXIT(kvm_events_tp[j]);
+		rec_argv[i++] = STRDUP_FAIL_EXIT(kvm_events_tp(e_machine)[j]);
 	}
 
 	rec_argv[i++] = STRDUP_FAIL_EXIT("-o");
@@ -1775,7 +1780,7 @@ static struct evlist *kvm_live_event_list(void)
 	if (evlist == NULL)
 		return NULL;
 
-	for (events_tp = kvm_events_tp; *events_tp; events_tp++) {
+	for (events_tp = kvm_events_tp(EM_HOST); *events_tp; events_tp++) {
 
 		tp = strdup(*events_tp);
 		if (tp == NULL)
@@ -1900,7 +1905,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
 	/*
 	 * generate the event list
 	 */
-	err = setup_kvm_events_tp(kvm);
+	err = setup_kvm_events_tp(kvm, EM_HOST);
 	if (err < 0) {
 		pr_err("Unable to setup the kvm tracepoints\n");
 		return err;
@@ -1985,13 +1990,7 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
 perf_stat:
 	return cmd_stat(argc, argv);
 }
-#endif /* HAVE_KVM_STAT_SUPPORT */
-
-int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
-					const char **argv __maybe_unused)
-{
-	return 0;
-}
+#endif /* HAVE_LIBTRACEEVENT */
 
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
@@ -2016,7 +2015,7 @@ static int __cmd_record(const char *file_name, int argc, const char **argv)
 
 	BUG_ON(i + 2 != rec_argc);
 
-	ret = kvm_add_default_arch_event(&i, rec_argv);
+	ret = kvm_add_default_arch_event(EM_HOST, &i, rec_argv);
 	if (ret)
 		goto EXIT;
 
@@ -2103,7 +2102,7 @@ static int __cmd_top(int argc, const char **argv)
 
 	BUG_ON(i != argc);
 
-	ret = kvm_add_default_arch_event(&i, rec_argv);
+	ret = kvm_add_default_arch_event(EM_HOST, &i, rec_argv);
 	if (ret)
 		goto EXIT;
 
@@ -2179,7 +2178,7 @@ int cmd_kvm(int argc, const char **argv)
 		return __cmd_top(argc, argv);
 	else if (strlen(argv[0]) > 2 && strstarts("buildid-list", argv[0]))
 		return __cmd_buildid_list(file_name, argc, argv);
-#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#if defined(HAVE_LIBTRACEEVENT)
 	else if (strlen(argv[0]) > 2 && strstarts("stat", argv[0]))
 		return kvm_cmd_stat(file_name, argc, argv);
 #endif
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index 87a5491048ac..50f69c2c0d51 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -648,7 +648,7 @@ int cmd_list(int argc, const char **argv)
 	}
 
 	for (i = 0; i < argc; ++i) {
-		char *sep, *s;
+		char *s;
 
 		if (strcmp(argv[i], "tracepoint") == 0) {
 			char *old_pmu_glob = default_ps.pmu_glob;
@@ -720,7 +720,7 @@ int cmd_list(int argc, const char **argv)
 		else if (strcmp(argv[i], "pfm") == 0)
 			print_libpfm_events(&print_cb, ps);
 #endif
-		else if ((sep = strchr(argv[i], ':')) != NULL) {
+		else if (strchr(argv[i], ':') != NULL) {
 			char *old_pmu_glob = ps->pmu_glob;
 			char *old_event_glob = ps->event_glob;
 
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 69800e4d9530..1b4ba85ee019 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -211,8 +211,7 @@ static int opt_set_target_ns(const struct option *opt __maybe_unused,
 		ns_pid = (pid_t)strtol(str, NULL, 10);
 		if (errno != 0) {
 			ret = -errno;
-			pr_warning("Failed to parse %s as a pid: %s\n", str,
-				   strerror(errno));
+			pr_warning("Failed to parse %s as a pid: %m\n", str);
 			return ret;
 		}
 		nsip = nsinfo__new(ns_pid);
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 2584d0d8bc82..60d764068302 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1286,7 +1286,6 @@ static int record__mmap_evlist(struct record *rec,
 	struct record_opts *opts = &rec->opts;
 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
 				  opts->auxtrace_sample_mode;
-	char msg[512];
 
 	if (opts->affinity != PERF_AFFINITY_SYS)
 		cpu__setup_cpunode_map();
@@ -1305,8 +1304,7 @@ static int record__mmap_evlist(struct record *rec,
 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
 			return -errno;
 		} else {
-			pr_err("failed to mmap with %d (%s)\n", errno,
-				str_error_r(errno, msg, sizeof(msg)));
+			pr_err("failed to mmap: %m\n");
 			if (errno)
 				return -errno;
 			else
@@ -1324,7 +1322,8 @@ static int record__mmap_evlist(struct record *rec,
 	if (record__threads_enabled(rec)) {
 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
 		if (ret) {
-			pr_err("Failed to create data directory: %s\n", strerror(-ret));
+			errno = -ret;
+			pr_err("Failed to create data directory: %m\n");
 			return ret;
 		}
 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
@@ -1404,6 +1403,7 @@ try_again:
 			}
 #endif
 			if (report_error || verbose > 0) {
+				evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
 				ui__error("Failure to open event '%s' on PMU '%s' which will be "
 					  "removed.\n%s\n",
 					  evsel__name(pos), evsel__pmu_name(pos), msg);
@@ -1461,9 +1461,8 @@ try_again:
 	}
 
 	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
-		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
-			pos->filter ?: "BPF", evsel__name(pos), errno,
-			str_error_r(errno, msg, sizeof(msg)));
+		pr_err("failed to set filter \"%s\" on event %s: %m\n",
+			pos->filter ?: "BPF", evsel__name(pos));
 		rc = -1;
 		goto out;
 	}
@@ -1511,6 +1510,8 @@ static int process_buildids(struct record *rec)
 	if (perf_data__size(&rec->data) == 0)
 		return 0;
 
+	/* A single DSO is needed and not all inline frames. */
+	symbol_conf.inline_name = false;
 	/*
 	 * During this process, it'll load kernel map and replace the
 	 * dso->long_name to a real pathname it found.  In this case
@@ -1521,7 +1522,6 @@ static int process_buildids(struct record *rec)
 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
 	 */
 	symbol_conf.ignore_vmlinux_buildid = true;
-
 	/*
 	 * If --buildid-all is given, it marks all DSO regardless of hits,
 	 * so no need to process samples. But if timestamp_boundary is enabled,
@@ -1748,8 +1748,7 @@ static void *record__thread(void *arg)
 
 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
 	if (err == -1)
-		pr_warning("threads[%d]: failed to notify on start: %s\n",
-			   thread->tid, strerror(errno));
+		pr_warning("threads[%d]: failed to notify on start: %m\n", thread->tid);
 
 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
 
@@ -1792,8 +1791,7 @@ static void *record__thread(void *arg)
 
 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
 	if (err == -1)
-		pr_warning("threads[%d]: failed to notify on termination: %s\n",
-			   thread->tid, strerror(errno));
+		pr_warning("threads[%d]: failed to notify on termination: %m\n", thread->tid);
 
 	return NULL;
 }
@@ -1881,7 +1879,7 @@ static int record__synthesize_workload(struct record *rec, bool tail)
 						 process_synthesized_event,
 						 &rec->session->machines.host,
 						 needs_mmap,
-						 rec->opts.sample_address);
+						 rec->opts.record_data_mmap);
 	perf_thread_map__put(thread_map);
 	return err;
 }
@@ -2191,7 +2189,7 @@ static int record__synthesize(struct record *rec, bool tail)
 
 		err = __machine__synthesize_threads(machine, tool, &opts->target,
 						    rec->evlist->core.threads,
-						    f, needs_mmap, opts->sample_address,
+						    f, needs_mmap, opts->record_data_mmap,
 						    rec->opts.nr_threads_synthesize);
 	}
 
@@ -2338,7 +2336,7 @@ static int record__start_threads(struct record *rec)
 
 	sigfillset(&full);
 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
-		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
+		pr_err("Failed to block signals on threads start: %m\n");
 		return -1;
 	}
 
@@ -2356,7 +2354,7 @@ static int record__start_threads(struct record *rec)
 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
 			for (tt = 1; tt < t; tt++)
 				record__terminate_thread(&thread_data[t]);
-			pr_err("Failed to start threads: %s\n", strerror(errno));
+			pr_err("Failed to start threads: %m\n");
 			ret = -1;
 			goto out_err;
 		}
@@ -2379,7 +2377,7 @@ out_err:
 	pthread_attr_destroy(&attrs);
 
 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
-		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
+		pr_err("Failed to unblock signals on threads start: %m\n");
 		ret = -1;
 	}
 
@@ -3006,8 +3004,9 @@ int record_opts__parse_callchain(struct record_opts *record,
 	ret = parse_callchain_record_opt(arg, callchain);
 	if (!ret) {
 		/* Enable data address sampling for DWARF unwind. */
-		if (callchain->record_mode == CALLCHAIN_DWARF)
-			record->sample_address = true;
+		if (callchain->record_mode == CALLCHAIN_DWARF &&
+		    !record->record_data_mmap_set)
+			record->record_data_mmap = true;
 		callchain_debug(callchain);
 	}
 
@@ -3686,6 +3685,9 @@ static struct option __record_options[] = {
 	OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
 		     "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
 		     record__parse_off_cpu_thresh),
+	OPT_BOOLEAN_SET(0, "data-mmap", &record.opts.record_data_mmap,
+			&record.opts.record_data_mmap_set,
+			"Record mmap events for non-executable mappings"),
 	OPT_END()
 };
 
@@ -4249,9 +4251,12 @@ int cmd_record(int argc, const char **argv)
 		goto out_opts;
 	}
 
-	/* For backward compatibility, -d implies --mem-info */
-	if (rec->opts.sample_address)
+	/* For backward compatibility, -d implies --mem-info and --data-mmap */
+	if (rec->opts.sample_address) {
 		rec->opts.sample_data_src = true;
+		if (!rec->opts.record_data_mmap_set)
+			rec->opts.record_data_mmap = true;
+	}
 
 	/*
 	 * Allow aliases to facilitate the lookup of symbols for address
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index add6b1c2aaf0..3b81f4b3dc49 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -448,7 +448,7 @@ static int report__setup_sample_type(struct report *rep)
 		}
 	}
 
-	callchain_param_setup(sample_type, perf_env__arch(perf_session__env(rep->session)));
+	callchain_param_setup(sample_type, perf_session__e_machine(session, /*e_flags=*/NULL));
 
 	if (rep->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
 		ui__warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
@@ -1271,12 +1271,18 @@ parse_percent_limit(const struct option *opt, const char *str,
 	return 0;
 }
 
+static int
+report_parse_addr2line_config(const struct option *opt __maybe_unused,
+			      const char *arg, int unset __maybe_unused)
+{
+	return addr2line_configure("addr2line.style", arg, NULL);
+}
+
 static int process_attr(const struct perf_tool *tool __maybe_unused,
 			union perf_event *event,
 			struct evlist **pevlist)
 {
 	struct perf_session *session;
-	struct perf_env *env;
 	u64 sample_type;
 	int err;
 
@@ -1290,8 +1296,7 @@ static int process_attr(const struct perf_tool *tool __maybe_unused,
 	 */
 	sample_type = evlist__combined_sample_type(*pevlist);
 	session = (*pevlist)->session;
-	env = perf_session__env(session);
-	callchain_param_setup(sample_type, perf_env__arch(env));
+	callchain_param_setup(sample_type, perf_session__e_machine(session, /*e_flags=*/NULL));
 	return 0;
 }
 
@@ -1447,6 +1452,9 @@ int cmd_report(int argc, const char **argv)
 		   "objdump binary to use for disassembly and annotations"),
 	OPT_STRING(0, "addr2line", &addr2line_path, "path",
 		   "addr2line binary to use for line numbers"),
+	OPT_CALLBACK(0, "addr2line-style", NULL, "addr2line style",
+		     "addr2line styles (libdw,llvm,libbfd,addr2line)",
+		     report_parse_addr2line_config),
 	OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
 		    "Symbol demangling. Enabled by default, use --no-demangle to disable."),
 	OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel,
@@ -1727,7 +1735,8 @@ repeat:
 			sort_order = NULL;
 	}
 
-	if (sort_order && strstr(sort_order, "type")) {
+	if ((sort_order && strstr(sort_order, "type")) ||
+	    (field_order && strstr(field_order, "type"))) {
 		report.data_type = true;
 		annotate_opts.annotate_src = false;
 
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index eca3b1c58c4b..3f509cfdd58c 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -28,6 +28,8 @@
 #include "util/debug.h"
 #include "util/event.h"
 #include "util/util.h"
+#include "util/synthetic-events.h"
+#include "util/target.h"
 
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -53,8 +55,10 @@
 #define SYM_LEN			129
 #define MAX_PID			1024000
 #define MAX_PRIO		140
+#define SEP_LEN			100
 
 static const char *cpu_list;
+static struct perf_cpu_map *user_requested_cpus;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 
 struct sched_atom;
@@ -236,6 +240,9 @@ struct perf_sched {
 	volatile bool   thread_funcs_exit;
 	const char	*prio_str;
 	DECLARE_BITMAP(prio_bitmap, MAX_PRIO);
+
+	struct perf_session *session;
+	struct perf_data *data;
 };
 
 /* per thread run time data */
@@ -3734,6 +3741,993 @@ static void setup_sorting(struct perf_sched *sched, const struct option *options
 	sort_dimension__add("pid", &sched->cmp_pid);
 }
 
+static int process_synthesized_schedstat_event(const struct perf_tool *tool,
+					       union perf_event *event,
+					       struct perf_sample *sample __maybe_unused,
+					       struct machine *machine __maybe_unused)
+{
+	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
+
+	if (perf_data__write(sched->data, event, event->header.size) <= 0) {
+		pr_err("failed to write perf data, error: %m\n");
+		return -1;
+	}
+
+	sched->session->header.data_size += event->header.size;
+	return 0;
+}
+
+static void sighandler(int sig __maybe_unused)
+{
+}
+
+static int enable_sched_schedstats(int *reset)
+{
+	char path[PATH_MAX];
+	FILE *fp;
+	char ch;
+
+	snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint());
+	fp = fopen(path, "w+");
+	if (!fp) {
+		pr_err("Failed to open %s\n", path);
+		return -1;
+	}
+
+	ch = getc(fp);
+	if (ch == '0') {
+		*reset = 1;
+		rewind(fp);
+		putc('1', fp);
+		fclose(fp);
+	}
+	return 0;
+}
+
+static int disable_sched_schedstat(void)
+{
+	char path[PATH_MAX];
+	FILE *fp;
+
+	snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint());
+	fp = fopen(path, "w");
+	if (!fp) {
+		pr_err("Failed to open %s\n", path);
+		return -1;
+	}
+
+	putc('0', fp);
+	fclose(fp);
+	return 0;
+}
+
+/* perf.data or any other output file name used by stats subcommand (only). */
+const char *output_name;
+
+static int perf_sched__schedstat_record(struct perf_sched *sched,
+					int argc, const char **argv)
+{
+	struct perf_session *session;
+	struct target target = {};
+	struct evlist *evlist;
+	int reset = 0;
+	int err = 0;
+	int fd;
+	struct perf_data data = {
+		.path  = output_name,
+		.mode  = PERF_DATA_MODE_WRITE,
+	};
+
+	signal(SIGINT, sighandler);
+	signal(SIGCHLD, sighandler);
+	signal(SIGTERM, sighandler);
+
+	evlist = evlist__new();
+	if (!evlist)
+		return -ENOMEM;
+
+	session = perf_session__new(&data, &sched->tool);
+	if (IS_ERR(session)) {
+		pr_err("Perf session creation failed.\n");
+		evlist__delete(evlist);
+		return PTR_ERR(session);
+	}
+
+	session->evlist = evlist;
+
+	sched->session = session;
+	sched->data = &data;
+
+	fd = perf_data__fd(&data);
+
+	/*
+	 * Capture all important metadata about the system. Although they are
+	 * not used by `perf sched stats` tool directly, they provide useful
+	 * information about profiled environment.
+	 */
+	perf_header__set_feat(&session->header, HEADER_HOSTNAME);
+	perf_header__set_feat(&session->header, HEADER_OSRELEASE);
+	perf_header__set_feat(&session->header, HEADER_VERSION);
+	perf_header__set_feat(&session->header, HEADER_ARCH);
+	perf_header__set_feat(&session->header, HEADER_NRCPUS);
+	perf_header__set_feat(&session->header, HEADER_CPUDESC);
+	perf_header__set_feat(&session->header, HEADER_CPUID);
+	perf_header__set_feat(&session->header, HEADER_TOTAL_MEM);
+	perf_header__set_feat(&session->header, HEADER_CMDLINE);
+	perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY);
+	perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY);
+	perf_header__set_feat(&session->header, HEADER_CACHE);
+	perf_header__set_feat(&session->header, HEADER_MEM_TOPOLOGY);
+	perf_header__set_feat(&session->header, HEADER_HYBRID_TOPOLOGY);
+	perf_header__set_feat(&session->header, HEADER_CPU_DOMAIN_INFO);
+
+	err = perf_session__write_header(session, evlist, fd, false);
+	if (err < 0)
+		goto out;
+
+	/*
+	 * `perf sched stats` does not support workload profiling (-p pid)
+	 * since /proc/schedstat file contains cpu specific data only. Hence, a
+	 * profile target is either set of cpus or systemwide, never a process.
+	 * Note that, although `-- <workload>` is supported, profile data are
+	 * still cpu/systemwide.
+	 */
+	if (cpu_list)
+		target.cpu_list = cpu_list;
+	else
+		target.system_wide = true;
+
+	if (argc) {
+		err = evlist__prepare_workload(evlist, &target, argv, false, NULL);
+		if (err)
+			goto out;
+	}
+
+	err = evlist__create_maps(evlist, &target);
+	if (err < 0)
+		goto out;
+
+	user_requested_cpus = evlist->core.user_requested_cpus;
+
+	err = perf_event__synthesize_schedstat(&(sched->tool),
+					       process_synthesized_schedstat_event,
+					       user_requested_cpus);
+	if (err < 0)
+		goto out;
+
+	err = enable_sched_schedstats(&reset);
+	if (err < 0)
+		goto out;
+
+	if (argc)
+		evlist__start_workload(evlist);
+
+	/* wait for signal */
+	pause();
+
+	if (reset) {
+		err = disable_sched_schedstat();
+		if (err < 0)
+			goto out;
+	}
+
+	err = perf_event__synthesize_schedstat(&(sched->tool),
+					       process_synthesized_schedstat_event,
+					       user_requested_cpus);
+	if (err < 0)
+		goto out;
+
+	err = perf_session__write_header(session, evlist, fd, true);
+
+out:
+	if (!err)
+		fprintf(stderr, "[ perf sched stats: Wrote samples to %s ]\n", data.path);
+	else
+		fprintf(stderr, "[ perf sched stats: Failed !! ]\n");
+
+	evlist__delete(evlist);
+	close(fd);
+	return err;
+}
+
+struct schedstat_domain {
+	struct list_head domain_list;
+	struct perf_record_schedstat_domain *domain_data;
+};
+
+struct schedstat_cpu {
+	struct list_head cpu_list;
+	struct list_head domain_head;
+	struct perf_record_schedstat_cpu *cpu_data;
+};
+
+static struct list_head cpu_head = LIST_HEAD_INIT(cpu_head);
+static struct schedstat_cpu *cpu_second_pass;
+static struct schedstat_domain *domain_second_pass;
+static bool after_workload_flag;
+static bool verbose_field;
+
+static void store_schedstat_cpu_diff(struct schedstat_cpu *after_workload)
+{
+	struct perf_record_schedstat_cpu *before = cpu_second_pass->cpu_data;
+	struct perf_record_schedstat_cpu *after = after_workload->cpu_data;
+	__u16 version = after_workload->cpu_data->version;
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)	\
+	(before->_ver._name = after->_ver._name - before->_ver._name)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+
+#undef CPU_FIELD
+}
+
+static void store_schedstat_domain_diff(struct schedstat_domain *after_workload)
+{
+	struct perf_record_schedstat_domain *before = domain_second_pass->domain_data;
+	struct perf_record_schedstat_domain *after = after_workload->domain_data;
+	__u16 version = after_workload->domain_data->version;
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)	\
+	(before->_ver._name = after->_ver._name - before->_ver._name)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+#undef DOMAIN_FIELD
+}
+
+#define PCT_CHNG(_x, _y)        ((_x) ? ((double)((double)(_y) - (_x)) / (_x)) * 100 : 0.0)
+static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs1,
+				   struct perf_record_schedstat_cpu *cs2)
+{
+	printf("%-65s ", "DESC");
+	if (!cs2)
+		printf("%12s %12s", "COUNT", "PCT_CHANGE");
+	else
+		printf("%12s %11s %12s %14s %10s", "COUNT1", "COUNT2", "PCT_CHANGE",
+		       "PCT_CHANGE1", "PCT_CHANGE2");
+
+	printf("\n");
+	print_separator2(SEP_LEN, "", 0);
+
+#define CALC_PCT(_x, _y)	((_y) ? ((double)(_x) / (_y)) * 100 : 0.0)
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)			\
+	do {										\
+		printf("%-65s: " _format, verbose_field ? _desc : #_name,		\
+		       cs1->_ver._name);						\
+		if (!cs2) {								\
+			if (_is_pct)							\
+				printf("  ( %8.2lf%% )",				\
+				       CALC_PCT(cs1->_ver._name, cs1->_ver._pct_of));	\
+		} else {								\
+			printf("," _format "  | %8.2lf%% |", cs2->_ver._name,		\
+			       PCT_CHNG(cs1->_ver._name, cs2->_ver._name));		\
+			if (_is_pct)							\
+				printf("  ( %8.2lf%%,  %8.2lf%% )",			\
+				       CALC_PCT(cs1->_ver._name, cs1->_ver._pct_of),	\
+				       CALC_PCT(cs2->_ver._name, cs2->_ver._pct_of));	\
+		}									\
+		printf("\n");								\
+	} while (0)
+
+	if (cs1->version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (cs1->version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (cs1->version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+
+#undef CPU_FIELD
+#undef CALC_PCT
+}
+
+static inline void print_domain_stats(struct perf_record_schedstat_domain *ds1,
+				      struct perf_record_schedstat_domain *ds2,
+				      __u64 jiffies1, __u64 jiffies2)
+{
+	printf("%-65s ", "DESC");
+	if (!ds2)
+		printf("%12s %14s", "COUNT", "AVG_JIFFIES");
+	else
+		printf("%12s %11s %12s %16s %12s", "COUNT1", "COUNT2", "PCT_CHANGE",
+		       "AVG_JIFFIES1", "AVG_JIFFIES2");
+	printf("\n");
+
+#define DOMAIN_CATEGORY(_desc)							\
+	do {									\
+		size_t _len = strlen(_desc);					\
+		size_t _pre_dash_cnt = (SEP_LEN - _len) / 2;			\
+		size_t _post_dash_cnt = SEP_LEN - _len - _pre_dash_cnt;		\
+		print_separator2((int)_pre_dash_cnt, _desc, (int)_post_dash_cnt);\
+	} while (0)
+
+#define CALC_AVG(_x, _y)	((_y) ? (long double)(_x) / (_y) : 0.0)
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		\
+	do {									\
+		printf("%-65s: " _format, verbose_field ? _desc : #_name,	\
+		       ds1->_ver._name);					\
+		if (!ds2) {							\
+			if (_is_jiffies)					\
+				printf("  $ %11.2Lf $",				\
+				       CALC_AVG(jiffies1, ds1->_ver._name));	\
+		} else {							\
+			printf("," _format "  | %8.2lf%% |", ds2->_ver._name,	\
+			       PCT_CHNG(ds1->_ver._name, ds2->_ver._name));	\
+			if (_is_jiffies)					\
+				printf("  $ %11.2Lf, %11.2Lf $",		\
+				       CALC_AVG(jiffies1, ds1->_ver._name),	\
+				       CALC_AVG(jiffies2, ds2->_ver._name));	\
+		}								\
+		printf("\n");							\
+	} while (0)
+
+#define DERIVED_CNT_FIELD(_name, _desc, _format, _x, _y, _z, _ver)		\
+	do {									\
+		__u32 t1 = ds1->_ver._x - ds1->_ver._y - ds1->_ver._z;		\
+		printf("*%-64s: " _format, verbose_field ? _desc : #_name, t1);	\
+		if (ds2) {							\
+			__u32 t2 = ds2->_ver._x - ds2->_ver._y - ds2->_ver._z;	\
+			printf("," _format "  | %8.2lf%% |", t2,		\
+			       PCT_CHNG(t1, t2));				\
+		}								\
+		printf("\n");							\
+	} while (0)
+
+#define DERIVED_AVG_FIELD(_name, _desc, _format, _x, _y, _z, _w, _ver)		\
+	do {									\
+		__u32 t1 = ds1->_ver._x - ds1->_ver._y - ds1->_ver._z;		\
+		printf("*%-64s: " _format, verbose_field ? _desc : #_name,	\
+		       CALC_AVG(ds1->_ver._w, t1));				\
+		if (ds2) {							\
+			__u32 t2 = ds2->_ver._x - ds2->_ver._y - ds2->_ver._z;	\
+			printf("," _format "  | %8.2Lf%% |",			\
+			       CALC_AVG(ds2->_ver._w, t2),			\
+			       PCT_CHNG(CALC_AVG(ds1->_ver._w, t1),		\
+					CALC_AVG(ds2->_ver._w, t2)));		\
+		}								\
+		printf("\n");							\
+	} while (0)
+
+	if (ds1->version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (ds1->version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (ds1->version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+
+#undef DERIVED_AVG_FIELD
+#undef DERIVED_CNT_FIELD
+#undef DOMAIN_FIELD
+#undef CALC_AVG
+#undef DOMAIN_CATEGORY
+}
+#undef PCT_CHNG
+
+static void summarize_schedstat_cpu(struct schedstat_cpu *summary_cpu,
+				    struct schedstat_cpu *cptr,
+				    int cnt, bool is_last)
+{
+	struct perf_record_schedstat_cpu *summary_cs = summary_cpu->cpu_data,
+					 *temp_cs = cptr->cpu_data;
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		\
+	do {									\
+		summary_cs->_ver._name += temp_cs->_ver._name;			\
+		if (is_last)							\
+			summary_cs->_ver._name /= cnt;				\
+	} while (0)
+
+	if (cptr->cpu_data->version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (cptr->cpu_data->version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (cptr->cpu_data->version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+#undef CPU_FIELD
+}
+
+static void summarize_schedstat_domain(struct schedstat_domain *summary_domain,
+				       struct schedstat_domain *dptr,
+				       int cnt, bool is_last)
+{
+	struct perf_record_schedstat_domain *summary_ds = summary_domain->domain_data,
+					    *temp_ds = dptr->domain_data;
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		\
+	do {									\
+		summary_ds->_ver._name += temp_ds->_ver._name;			\
+		if (is_last)							\
+			summary_ds->_ver._name /= cnt;				\
+	} while (0)
+
+	if (dptr->domain_data->version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (dptr->domain_data->version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (dptr->domain_data->version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+#undef DOMAIN_FIELD
+}
+
+/*
+ * get_all_cpu_stats() appends the summary to the head of the list.
+ */
+static int get_all_cpu_stats(struct list_head *head)
+{
+	struct schedstat_cpu *cptr = list_first_entry(head, struct schedstat_cpu, cpu_list);
+	struct schedstat_cpu *summary_head = NULL;
+	struct perf_record_schedstat_domain *ds;
+	struct perf_record_schedstat_cpu *cs;
+	struct schedstat_domain *dptr, *tdptr;
+	bool is_last = false;
+	int cnt = 1;
+	int ret = 0;
+
+	if (cptr) {
+		summary_head = zalloc(sizeof(*summary_head));
+		if (!summary_head)
+			return -ENOMEM;
+
+		summary_head->cpu_data = zalloc(sizeof(*cs));
+		memcpy(summary_head->cpu_data, cptr->cpu_data, sizeof(*cs));
+
+		INIT_LIST_HEAD(&summary_head->domain_head);
+
+		list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
+			tdptr = zalloc(sizeof(*tdptr));
+			if (!tdptr)
+				return -ENOMEM;
+
+			tdptr->domain_data = zalloc(sizeof(*ds));
+			if (!tdptr->domain_data)
+				return -ENOMEM;
+
+			memcpy(tdptr->domain_data, dptr->domain_data, sizeof(*ds));
+			list_add_tail(&tdptr->domain_list, &summary_head->domain_head);
+		}
+	}
+
+	list_for_each_entry(cptr, head, cpu_list) {
+		if (list_is_first(&cptr->cpu_list, head))
+			continue;
+
+		if (list_is_last(&cptr->cpu_list, head))
+			is_last = true;
+
+		cnt++;
+		summarize_schedstat_cpu(summary_head, cptr, cnt, is_last);
+		tdptr = list_first_entry(&summary_head->domain_head, struct schedstat_domain,
+					 domain_list);
+
+		list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
+			summarize_schedstat_domain(tdptr, dptr, cnt, is_last);
+			tdptr = list_next_entry(tdptr, domain_list);
+		}
+	}
+
+	list_add(&summary_head->cpu_list, head);
+	return ret;
+}
+
+static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map **cd_map1,
+			       struct list_head *head2, struct cpu_domain_map **cd_map2,
+			       bool summary_only)
+{
+	struct schedstat_cpu *cptr1 = list_first_entry(head1, struct schedstat_cpu, cpu_list);
+	struct perf_record_schedstat_domain *ds1 = NULL, *ds2 = NULL;
+	struct perf_record_schedstat_cpu *cs1 = NULL, *cs2 = NULL;
+	struct schedstat_domain *dptr1 = NULL, *dptr2 = NULL;
+	struct schedstat_cpu *cptr2 = NULL;
+	__u64 jiffies1 = 0, jiffies2 = 0;
+	bool is_summary = true;
+	int ret = 0;
+
+	printf("Description\n");
+	print_separator2(SEP_LEN, "", 0);
+	printf("%-30s-> %s\n", "DESC", "Description of the field");
+	printf("%-30s-> %s\n", "COUNT", "Value of the field");
+	printf("%-30s-> %s\n", "PCT_CHANGE", "Percent change with corresponding base value");
+	printf("%-30s-> %s\n", "AVG_JIFFIES",
+	       "Avg time in jiffies between two consecutive occurrence of event");
+
+	print_separator2(SEP_LEN, "", 0);
+	printf("\n");
+
+	printf("%-65s: ", "Time elapsed (in jiffies)");
+	jiffies1 = cptr1->cpu_data->timestamp;
+	printf("%11llu", jiffies1);
+	if (head2) {
+		cptr2 = list_first_entry(head2, struct schedstat_cpu, cpu_list);
+		jiffies2 = cptr2->cpu_data->timestamp;
+		printf(",%11llu", jiffies2);
+	}
+	printf("\n");
+
+	ret = get_all_cpu_stats(head1);
+	if (cptr2) {
+		ret = get_all_cpu_stats(head2);
+		cptr2 = list_first_entry(head2, struct schedstat_cpu, cpu_list);
+	}
+
+	list_for_each_entry(cptr1, head1, cpu_list) {
+		struct cpu_domain_map *cd_info1 = NULL, *cd_info2 = NULL;
+
+		cs1 = cptr1->cpu_data;
+		cd_info1 = cd_map1[cs1->cpu];
+		if (cptr2) {
+			cs2 = cptr2->cpu_data;
+			cd_info2 = cd_map2[cs2->cpu];
+			dptr2 = list_first_entry(&cptr2->domain_head, struct schedstat_domain,
+						 domain_list);
+		}
+
+		if (cs2 && cs1->cpu != cs2->cpu) {
+			pr_err("Failed because matching cpus not found for diff\n");
+			return -1;
+		}
+
+		if (cd_info2 && cd_info1->nr_domains != cd_info2->nr_domains) {
+			pr_err("Failed because nr_domains is not same for cpus\n");
+			return -1;
+		}
+
+		print_separator2(SEP_LEN, "", 0);
+
+		if (is_summary)
+			printf("CPU: <ALL CPUS SUMMARY>\n");
+		else
+			printf("CPU: %d\n", cs1->cpu);
+
+		print_separator2(SEP_LEN, "", 0);
+		print_cpu_stats(cs1, cs2);
+		print_separator2(SEP_LEN, "", 0);
+
+		list_for_each_entry(dptr1, &cptr1->domain_head, domain_list) {
+			struct domain_info *dinfo1 = NULL, *dinfo2 = NULL;
+
+			ds1 = dptr1->domain_data;
+			dinfo1 = cd_info1->domains[ds1->domain];
+			if (dptr2) {
+				ds2 = dptr2->domain_data;
+				dinfo2 = cd_info2->domains[ds2->domain];
+			}
+
+			if (dinfo2 && dinfo1->domain != dinfo2->domain) {
+				pr_err("Failed because matching domain not found for diff\n");
+				return -1;
+			}
+
+			if (is_summary) {
+				if (dinfo1->dname)
+					printf("CPU: <ALL CPUS SUMMARY> | DOMAIN: %s\n",
+					       dinfo1->dname);
+				else
+					printf("CPU: <ALL CPUS SUMMARY> | DOMAIN: %d\n",
+					       dinfo1->domain);
+			} else {
+				if (dinfo1->dname)
+					printf("CPU: %d | DOMAIN: %s | DOMAIN_CPUS: ",
+					       cs1->cpu, dinfo1->dname);
+				else
+					printf("CPU: %d | DOMAIN: %d | DOMAIN_CPUS: ",
+					       cs1->cpu, dinfo1->domain);
+
+				printf("%s\n", dinfo1->cpulist);
+			}
+			print_separator2(SEP_LEN, "", 0);
+			print_domain_stats(ds1, ds2, jiffies1, jiffies2);
+			print_separator2(SEP_LEN, "", 0);
+
+			if (dptr2)
+				dptr2 = list_next_entry(dptr2, domain_list);
+		}
+		if (summary_only)
+			break;
+
+		if (cptr2)
+			cptr2 = list_next_entry(cptr2, cpu_list);
+
+		is_summary = false;
+	}
+	return ret;
+}
+
+/*
+ * Creates a linked list of cpu_data and domain_data. Below represents the structure of the linked
+ * list where CPU0,CPU1,CPU2, ..., CPU(N-1) stores the cpu_data. Here N is the total number of cpus.
+ * Each of the CPU points to the list of domain_data. Here DOMAIN0, DOMAIN1, DOMAIN2, ... represents
+ * the domain_data. Here D0, D1, D2, ..., Dm are the number of domains in the respective cpus.
+ *
+ *	+----------+
+ *	| CPU_HEAD |
+ *	+----------+
+ *	      |
+ *	      v
+ *	+----------+    +---------+    +---------+    +---------+	    +--------------+
+ *	|   CPU0   | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D0-1) |
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	      |
+ *	      v
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	|   CPU1   | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D1-1) |
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	      |
+ *	      v
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	|   CPU2   | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D2-1) |
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	      |
+ *	      v
+ *	     ...
+ *	      |
+ *	      v
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	| CPU(N-1) | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(Dm-1) |
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *
+ * Each cpu as well as domain has 2 enties in the event list one before the workload starts and
+ * other after completion of the workload. The above linked list stores the diff of the cpu and
+ * domain statistics.
+ */
+static int perf_sched__process_schedstat(const struct perf_tool *tool __maybe_unused,
+					 struct perf_session *session __maybe_unused,
+					 union perf_event *event)
+{
+	struct perf_cpu this_cpu;
+	static __u32 initial_cpu;
+
+	switch (event->header.type) {
+	case PERF_RECORD_SCHEDSTAT_CPU:
+		this_cpu.cpu = event->schedstat_cpu.cpu;
+		break;
+	case PERF_RECORD_SCHEDSTAT_DOMAIN:
+		this_cpu.cpu = event->schedstat_domain.cpu;
+		break;
+	default:
+		return 0;
+	}
+
+	if (user_requested_cpus && !perf_cpu_map__has(user_requested_cpus, this_cpu))
+		return 0;
+
+	if (event->header.type == PERF_RECORD_SCHEDSTAT_CPU) {
+		struct schedstat_cpu *temp = zalloc(sizeof(*temp));
+
+		if (!temp)
+			return -ENOMEM;
+
+		temp->cpu_data = zalloc(sizeof(*temp->cpu_data));
+		if (!temp->cpu_data)
+			return -ENOMEM;
+
+		memcpy(temp->cpu_data, &event->schedstat_cpu, sizeof(*temp->cpu_data));
+
+		if (!list_empty(&cpu_head) && temp->cpu_data->cpu == initial_cpu)
+			after_workload_flag = true;
+
+		if (!after_workload_flag) {
+			if (list_empty(&cpu_head))
+				initial_cpu = temp->cpu_data->cpu;
+
+			list_add_tail(&temp->cpu_list, &cpu_head);
+			INIT_LIST_HEAD(&temp->domain_head);
+		} else {
+			if (temp->cpu_data->cpu == initial_cpu) {
+				cpu_second_pass = list_first_entry(&cpu_head, struct schedstat_cpu,
+								   cpu_list);
+				cpu_second_pass->cpu_data->timestamp =
+					temp->cpu_data->timestamp - cpu_second_pass->cpu_data->timestamp;
+			} else {
+				cpu_second_pass = list_next_entry(cpu_second_pass, cpu_list);
+			}
+			domain_second_pass = list_first_entry(&cpu_second_pass->domain_head,
+							      struct schedstat_domain, domain_list);
+			store_schedstat_cpu_diff(temp);
+		}
+	} else if (event->header.type == PERF_RECORD_SCHEDSTAT_DOMAIN) {
+		struct schedstat_cpu *cpu_tail;
+		struct schedstat_domain *temp = zalloc(sizeof(*temp));
+
+		if (!temp)
+			return -ENOMEM;
+
+		temp->domain_data = zalloc(sizeof(*temp->domain_data));
+		if (!temp->domain_data)
+			return -ENOMEM;
+
+		memcpy(temp->domain_data, &event->schedstat_domain, sizeof(*temp->domain_data));
+
+		if (!after_workload_flag) {
+			cpu_tail = list_last_entry(&cpu_head, struct schedstat_cpu, cpu_list);
+			list_add_tail(&temp->domain_list, &cpu_tail->domain_head);
+		} else {
+			store_schedstat_domain_diff(temp);
+			domain_second_pass = list_next_entry(domain_second_pass, domain_list);
+		}
+	}
+
+	return 0;
+}
+
+static void free_schedstat(struct list_head *head)
+{
+	struct schedstat_domain *dptr, *n1;
+	struct schedstat_cpu *cptr, *n2;
+
+	list_for_each_entry_safe(cptr, n2, head, cpu_list) {
+		list_for_each_entry_safe(dptr, n1, &cptr->domain_head, domain_list) {
+			list_del_init(&dptr->domain_list);
+			free(dptr);
+		}
+		list_del_init(&cptr->cpu_list);
+		free(cptr);
+	}
+}
+
+static int perf_sched__schedstat_report(struct perf_sched *sched)
+{
+	struct cpu_domain_map **cd_map;
+	struct perf_session *session;
+	struct target target = {};
+	struct perf_data data = {
+		.path  = input_name,
+		.mode  = PERF_DATA_MODE_READ,
+	};
+	int err = 0;
+
+	sched->tool.schedstat_cpu = perf_sched__process_schedstat;
+	sched->tool.schedstat_domain = perf_sched__process_schedstat;
+
+	session = perf_session__new(&data, &sched->tool);
+	if (IS_ERR(session)) {
+		pr_err("Perf session creation failed.\n");
+		return PTR_ERR(session);
+	}
+
+	if (cpu_list)
+		target.cpu_list = cpu_list;
+	else
+		target.system_wide = true;
+
+	err = evlist__create_maps(session->evlist, &target);
+	if (err < 0)
+		goto out;
+
+	user_requested_cpus = session->evlist->core.user_requested_cpus;
+
+	err = perf_session__process_events(session);
+
+	if (!err) {
+		setup_pager();
+
+		if (list_empty(&cpu_head)) {
+			pr_err("Data is not available\n");
+			err = -1;
+			goto out;
+		}
+
+		cd_map = session->header.env.cpu_domain;
+		err = show_schedstat_data(&cpu_head, cd_map, NULL, NULL, false);
+	}
+
+out:
+	free_schedstat(&cpu_head);
+	perf_session__delete(session);
+	return err;
+}
+
+static int perf_sched__schedstat_diff(struct perf_sched *sched,
+				      int argc, const char **argv)
+{
+	struct cpu_domain_map **cd_map0 = NULL, **cd_map1 = NULL;
+	struct list_head cpu_head_ses0, cpu_head_ses1;
+	struct perf_session *session[2];
+	struct perf_data data[2];
+	int ret = 0, err = 0;
+	static const char *defaults[] = {
+		"perf.data.old",
+		"perf.data",
+	};
+
+	if (argc) {
+		if (argc == 1)
+			defaults[1] = argv[0];
+		else if (argc == 2) {
+			defaults[0] = argv[0];
+			defaults[1] = argv[1];
+		} else {
+			pr_err("perf sched stats diff is not supported with more than 2 files.\n");
+			goto out_ret;
+		}
+	}
+
+	INIT_LIST_HEAD(&cpu_head_ses0);
+	INIT_LIST_HEAD(&cpu_head_ses1);
+
+	sched->tool.schedstat_cpu = perf_sched__process_schedstat;
+	sched->tool.schedstat_domain = perf_sched__process_schedstat;
+
+	data[0].path = defaults[0];
+	data[0].mode  = PERF_DATA_MODE_READ;
+	session[0] = perf_session__new(&data[0], &sched->tool);
+	if (IS_ERR(session[0])) {
+		ret = PTR_ERR(session[0]);
+		pr_err("Failed to open %s\n", data[0].path);
+		goto out_delete_ses0;
+	}
+
+	err = perf_session__process_events(session[0]);
+	if (err)
+		goto out_delete_ses0;
+
+	cd_map0 = session[0]->header.env.cpu_domain;
+	list_replace_init(&cpu_head, &cpu_head_ses0);
+	after_workload_flag = false;
+
+	data[1].path = defaults[1];
+	data[1].mode  = PERF_DATA_MODE_READ;
+	session[1] = perf_session__new(&data[1], &sched->tool);
+	if (IS_ERR(session[1])) {
+		ret = PTR_ERR(session[1]);
+		pr_err("Failed to open %s\n", data[1].path);
+		goto out_delete_ses1;
+	}
+
+	err = perf_session__process_events(session[1]);
+	if (err)
+		goto out_delete_ses1;
+
+	cd_map1 = session[1]->header.env.cpu_domain;
+	list_replace_init(&cpu_head, &cpu_head_ses1);
+	after_workload_flag = false;
+	setup_pager();
+
+	if (list_empty(&cpu_head_ses1)) {
+		pr_err("Data is not available\n");
+		ret = -1;
+		goto out_delete_ses1;
+	}
+
+	if (list_empty(&cpu_head_ses0)) {
+		pr_err("Data is not available\n");
+		ret = -1;
+		goto out_delete_ses0;
+	}
+
+	show_schedstat_data(&cpu_head_ses0, cd_map0, &cpu_head_ses1, cd_map1, true);
+
+out_delete_ses1:
+	free_schedstat(&cpu_head_ses1);
+	if (!IS_ERR(session[1]))
+		perf_session__delete(session[1]);
+
+out_delete_ses0:
+	free_schedstat(&cpu_head_ses0);
+	if (!IS_ERR(session[0]))
+		perf_session__delete(session[0]);
+
+out_ret:
+	return ret;
+}
+
+static int process_synthesized_event_live(const struct perf_tool *tool __maybe_unused,
+					  union perf_event *event,
+					  struct perf_sample *sample __maybe_unused,
+					  struct machine *machine __maybe_unused)
+{
+	return perf_sched__process_schedstat(tool, NULL, event);
+}
+
+static int perf_sched__schedstat_live(struct perf_sched *sched,
+				      int argc, const char **argv)
+{
+	struct cpu_domain_map **cd_map = NULL;
+	struct target target = {};
+	u32 __maybe_unused md;
+	struct evlist *evlist;
+	u32 nr = 0, sv;
+	int reset = 0;
+	int err = 0;
+
+	signal(SIGINT, sighandler);
+	signal(SIGCHLD, sighandler);
+	signal(SIGTERM, sighandler);
+
+	evlist = evlist__new();
+	if (!evlist)
+		return -ENOMEM;
+
+	/*
+	 * `perf sched schedstat` does not support workload profiling (-p pid)
+	 * since /proc/schedstat file contains cpu specific data only. Hence, a
+	 * profile target is either set of cpus or systemwide, never a process.
+	 * Note that, although `-- <workload>` is supported, profile data are
+	 * still cpu/systemwide.
+	 */
+	if (cpu_list)
+		target.cpu_list = cpu_list;
+	else
+		target.system_wide = true;
+
+	if (argc) {
+		err = evlist__prepare_workload(evlist, &target, argv, false, NULL);
+		if (err)
+			goto out;
+	}
+
+	err = evlist__create_maps(evlist, &target);
+	if (err < 0)
+		goto out;
+
+	user_requested_cpus = evlist->core.user_requested_cpus;
+
+	err = perf_event__synthesize_schedstat(&(sched->tool),
+					       process_synthesized_event_live,
+					       user_requested_cpus);
+	if (err < 0)
+		goto out;
+
+	err = enable_sched_schedstats(&reset);
+	if (err < 0)
+		goto out;
+
+	if (argc)
+		evlist__start_workload(evlist);
+
+	/* wait for signal */
+	pause();
+
+	if (reset) {
+		err = disable_sched_schedstat();
+		if (err < 0)
+			goto out;
+	}
+
+	err = perf_event__synthesize_schedstat(&(sched->tool),
+					       process_synthesized_event_live,
+					       user_requested_cpus);
+	if (err)
+		goto out;
+
+	setup_pager();
+
+	if (list_empty(&cpu_head)) {
+		pr_err("Data is not available\n");
+		err = -1;
+		goto out;
+	}
+
+	nr = cpu__max_present_cpu().cpu;
+	cd_map = build_cpu_domain_map(&sv, &md, nr);
+	if (!cd_map) {
+		pr_err("Unable to generate cpu-domain relation info");
+		goto out;
+	}
+
+	show_schedstat_data(&cpu_head, cd_map, NULL, NULL, false);
+	free_cpu_domain_info(cd_map, sv, nr);
+out:
+	free_schedstat(&cpu_head);
+	evlist__delete(evlist);
+	return err;
+}
+
 static bool schedstat_events_exposed(void)
 {
 	/*
@@ -3910,6 +4904,15 @@ int cmd_sched(int argc, const char **argv)
 	OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
 	OPT_PARENT(sched_options)
 	};
+	const struct option stats_options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		   "`stats report` with input filename"),
+	OPT_STRING('o', "output", &output_name, "file",
+		   "`stats record` with output filename"),
+	OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+	OPT_BOOLEAN('v', "verbose", &verbose_field, "Show explanation for fields in the report"),
+	OPT_END()
+	};
 
 	const char * const latency_usage[] = {
 		"perf sched latency [<options>]",
@@ -3927,9 +4930,13 @@ int cmd_sched(int argc, const char **argv)
 		"perf sched timehist [<options>]",
 		NULL
 	};
+	const char *stats_usage[] = {
+		"perf sched stats {record|report} [<options>]",
+		NULL
+	};
 	const char *const sched_subcommands[] = { "record", "latency", "map",
 						  "replay", "script",
-						  "timehist", NULL };
+						  "timehist", "stats", NULL };
 	const char *sched_usage[] = {
 		NULL,
 		NULL
@@ -4027,6 +5034,31 @@ int cmd_sched(int argc, const char **argv)
 		ret = symbol__validate_sym_arguments();
 		if (!ret)
 			ret = perf_sched__timehist(&sched);
+	} else if (!strcmp(argv[0], "stats")) {
+		const char *const stats_subcommands[] = {"record", "report", NULL};
+
+		argc = parse_options_subcommand(argc, argv, stats_options,
+						stats_subcommands,
+						stats_usage,
+						PARSE_OPT_STOP_AT_NON_OPTION);
+
+		if (argv[0] && !strcmp(argv[0], "record")) {
+			if (argc)
+				argc = parse_options(argc, argv, stats_options,
+						     stats_usage, 0);
+			return perf_sched__schedstat_record(&sched, argc, argv);
+		} else if (argv[0] && !strcmp(argv[0], "report")) {
+			if (argc)
+				argc = parse_options(argc, argv, stats_options,
+						     stats_usage, 0);
+			return perf_sched__schedstat_report(&sched);
+		} else if (argv[0] && !strcmp(argv[0], "diff")) {
+			if (argc)
+				argc = parse_options(argc, argv, stats_options,
+						     stats_usage, 0);
+			return perf_sched__schedstat_diff(&sched, argc, argv);
+		}
+		return perf_sched__schedstat_live(&sched, argc, argv);
 	} else {
 		usage_with_options(sched_usage, sched_options);
 	}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 62e43d3c5ad7..7c743a303507 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -37,7 +37,6 @@
 #include "ui/ui.h"
 #include "print_binary.h"
 #include "print_insn.h"
-#include "archinsn.h"
 #include <linux/bitmap.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
@@ -90,7 +89,6 @@ static bool			print_flags;
 static const char		*cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 static int			max_blocks;
-static bool			native_arch;
 static struct dlfilter		*dlfilter;
 static int			dlargc;
 static char			**dlargv;
@@ -717,7 +715,8 @@ out:
 	return 0;
 }
 
-static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, const char *arch,
+static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask,
+				     uint16_t e_machine, uint32_t e_flags,
 				     FILE *fp)
 {
 	unsigned i = 0, r;
@@ -730,7 +729,9 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, cons
 
 	for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
 		u64 val = regs->regs[i++];
-		printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r, arch), val);
+		printed += fprintf(fp, "%5s:0x%"PRIx64" ",
+				   perf_reg_name(r, e_machine, e_flags),
+				   val);
 	}
 
 	return printed;
@@ -787,23 +788,29 @@ tod_scnprintf(struct perf_script *script, char *buf, int buflen,
 }
 
 static int perf_sample__fprintf_iregs(struct perf_sample *sample,
-				      struct perf_event_attr *attr, const char *arch, FILE *fp)
+				      struct perf_event_attr *attr,
+				      uint16_t e_machine,
+				      uint32_t e_flags,
+				      FILE *fp)
 {
 	if (!sample->intr_regs)
 		return 0;
 
 	return perf_sample__fprintf_regs(perf_sample__intr_regs(sample),
-					 attr->sample_regs_intr, arch, fp);
+					 attr->sample_regs_intr, e_machine, e_flags, fp);
 }
 
 static int perf_sample__fprintf_uregs(struct perf_sample *sample,
-				      struct perf_event_attr *attr, const char *arch, FILE *fp)
+				      struct perf_event_attr *attr,
+				      uint16_t e_machine,
+				      uint32_t e_flags,
+				      FILE *fp)
 {
 	if (!sample->user_regs)
 		return 0;
 
 	return perf_sample__fprintf_regs(perf_sample__user_regs(sample),
-					 attr->sample_regs_user, arch, fp);
+					 attr->sample_regs_user, e_machine, e_flags, fp);
 }
 
 static int perf_sample__fprintf_start(struct perf_script *script,
@@ -1618,7 +1625,7 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample,
 {
 	int printed = 0;
 
-	script_fetch_insn(sample, thread, machine, native_arch);
+	perf_sample__fetch_insn(sample, thread, machine);
 
 	if (PRINT_FIELD(INSNLEN))
 		printed += fprintf(fp, " ilen: %d", sample->insn_len);
@@ -2418,7 +2425,7 @@ static void process_event(struct perf_script *script,
 	struct evsel_script *es = evsel->priv;
 	FILE *fp = es->fp;
 	char str[PAGE_SIZE_NAME_LEN];
-	const char *arch = perf_env__arch(machine->env);
+	uint32_t e_flags;
 
 	if (output[type].fields == 0)
 		return;
@@ -2505,11 +2512,19 @@ static void process_event(struct perf_script *script,
 				    symbol_conf.bt_stop_list, fp);
 	}
 
-	if (PRINT_FIELD(IREGS))
-		perf_sample__fprintf_iregs(sample, attr, arch, fp);
+	if (PRINT_FIELD(IREGS)) {
+		perf_sample__fprintf_iregs(sample, attr,
+					   thread__e_machine(thread, machine, &e_flags),
+					   e_flags,
+					   fp);
+	}
 
-	if (PRINT_FIELD(UREGS))
-		perf_sample__fprintf_uregs(sample, attr, arch, fp);
+	if (PRINT_FIELD(UREGS)) {
+		perf_sample__fprintf_uregs(sample, attr,
+					   thread__e_machine(thread, machine, &e_flags),
+					   e_flags,
+					   fp);
+	}
 
 	if (PRINT_FIELD(BRSTACK))
 		perf_sample__fprintf_brstack(sample, thread, evsel, fp);
@@ -2803,6 +2818,7 @@ static int process_attr(const struct perf_tool *tool, union perf_event *event,
 	struct perf_script *scr = container_of(tool, struct perf_script, tool);
 	struct evlist *evlist;
 	struct evsel *evsel, *pos;
+	uint16_t e_machine;
 	u64 sample_type;
 	int err;
 
@@ -2844,7 +2860,8 @@ static int process_attr(const struct perf_tool *tool, union perf_event *event,
 	 * on events sample_type.
 	 */
 	sample_type = evlist__combined_sample_type(evlist);
-	callchain_param_setup(sample_type, perf_env__arch(perf_session__env(scr->session)));
+	e_machine = perf_session__e_machine(evsel__session(evsel), /*e_flags=*/NULL);
+	callchain_param_setup(sample_type, e_machine);
 
 	/* Enable fields for callchain entries */
 	if (symbol_conf.use_callchain &&
@@ -3819,7 +3836,7 @@ static void script__setup_sample_type(struct perf_script *script)
 	struct perf_session *session = script->session;
 	u64 sample_type = evlist__combined_sample_type(session->evlist);
 
-	callchain_param_setup(sample_type, perf_env__arch(session->machines.host.env));
+	callchain_param_setup(sample_type, perf_session__e_machine(session, /*e_flags=*/NULL));
 
 	if (script->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
 		pr_warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
@@ -4017,7 +4034,6 @@ int cmd_script(int argc, const char **argv)
 		.set = false,
 		.default_no_sample = true,
 	};
-	struct utsname uts;
 	char *script_path = NULL;
 	const char *dlfilter_file = NULL;
 	const char **__argv;
@@ -4439,17 +4455,6 @@ script_found:
 	if (symbol__init(env) < 0)
 		goto out_delete;
 
-	uname(&uts);
-	if (data.is_pipe) { /* Assume pipe_mode indicates native_arch */
-		native_arch = true;
-	} else if (env->arch) {
-		if (!strcmp(uts.machine, env->arch))
-			native_arch = true;
-		else if (!strcmp(uts.machine, "x86_64") &&
-			 !strcmp(env->arch, "i386"))
-			native_arch = true;
-	}
-
 	script.session = session;
 	script__setup_sample_type(&script);
 
@@ -4484,6 +4489,7 @@ script_found:
 	if (generate_script_lang) {
 		struct stat perf_stat;
 		int input;
+		char *filename = strdup("perf-script");
 
 		if (output_set_by_user()) {
 			fprintf(stderr,
@@ -4511,17 +4517,32 @@ script_found:
 		}
 
 		scripting_ops = script_spec__lookup(generate_script_lang);
+		if (!scripting_ops && ends_with(generate_script_lang, ".py")) {
+			scripting_ops = script_spec__lookup("python");
+			free(filename);
+			filename = strdup(generate_script_lang);
+			filename[strlen(filename) - 3] = '\0';
+		} else if (!scripting_ops && ends_with(generate_script_lang, ".pl")) {
+			scripting_ops = script_spec__lookup("perl");
+			free(filename);
+			filename = strdup(generate_script_lang);
+			filename[strlen(filename) - 3] = '\0';
+		}
 		if (!scripting_ops) {
-			fprintf(stderr, "invalid language specifier");
+			fprintf(stderr, "invalid language specifier '%s'\n", generate_script_lang);
 			err = -ENOENT;
 			goto out_delete;
 		}
+		if (!filename) {
+			err = -ENOMEM;
+			goto out_delete;
+		}
 #ifdef HAVE_LIBTRACEEVENT
-		err = scripting_ops->generate_script(session->tevent.pevent,
-						     "perf-script");
+		err = scripting_ops->generate_script(session->tevent.pevent, filename);
 #else
-		err = scripting_ops->generate_script(NULL, "perf-script");
+		err = scripting_ops->generate_script(NULL, filename);
 #endif
+		free(filename);
 		goto out_delete;
 	}
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ab40d85fb125..73c2ba7e3076 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -369,19 +369,11 @@ static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
 static int read_counters_with_affinity(void)
 {
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity saved_affinity, *affinity;
 
 	if (all_counters_use_bpf)
 		return 0;
 
-	if (!target__has_cpu(&target) || target__has_per_thread(&target))
-		affinity = NULL;
-	else if (affinity__setup(&saved_affinity) < 0)
-		return -1;
-	else
-		affinity = &saved_affinity;
-
-	evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
+	evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
 		struct evsel *counter = evlist_cpu_itr.evsel;
 
 		if (evsel__is_bpf(counter))
@@ -393,8 +385,6 @@ static int read_counters_with_affinity(void)
 		if (!counter->err)
 			counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx);
 	}
-	if (affinity)
-		affinity__cleanup(&saved_affinity);
 
 	return 0;
 }
@@ -793,7 +783,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	const bool forks = (argc > 0);
 	bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity saved_affinity, *affinity = NULL;
 	int err, open_err = 0;
 	bool second_pass = false, has_supported_counters;
 
@@ -805,14 +794,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		child_pid = evsel_list->workload.pid;
 	}
 
-	if (!cpu_map__is_dummy(evsel_list->core.user_requested_cpus)) {
-		if (affinity__setup(&saved_affinity) < 0) {
-			err = -1;
-			goto err_out;
-		}
-		affinity = &saved_affinity;
-	}
-
 	evlist__for_each_entry(evsel_list, counter) {
 		counter->reset_group = false;
 		if (bpf_counter__load(counter, &target)) {
@@ -825,49 +806,48 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 
 	evlist__reset_aggr_stats(evsel_list);
 
-	evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
-		counter = evlist_cpu_itr.evsel;
+	/*
+	 * bperf calls evsel__open_per_cpu() in bperf__load(), so
+	 * no need to call it again here.
+	 */
+	if (!target.use_bpf) {
+		evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
+			counter = evlist_cpu_itr.evsel;
 
-		/*
-		 * bperf calls evsel__open_per_cpu() in bperf__load(), so
-		 * no need to call it again here.
-		 */
-		if (target.use_bpf)
-			break;
+			if (counter->reset_group || !counter->supported)
+				continue;
+			if (evsel__is_bperf(counter))
+				continue;
 
-		if (counter->reset_group || !counter->supported)
-			continue;
-		if (evsel__is_bperf(counter))
-			continue;
+			while (true) {
+				if (create_perf_stat_counter(counter, &stat_config,
+							      evlist_cpu_itr.cpu_map_idx) == 0)
+					break;
 
-		while (true) {
-			if (create_perf_stat_counter(counter, &stat_config,
-						     evlist_cpu_itr.cpu_map_idx) == 0)
-				break;
+				open_err = errno;
+				/*
+				 * Weak group failed. We cannot just undo this
+				 * here because earlier CPUs might be in group
+				 * mode, and the kernel doesn't support mixing
+				 * group and non group reads. Defer it to later.
+				 * Don't close here because we're in the wrong
+				 * affinity.
+				 */
+				if ((open_err == EINVAL || open_err == EBADF) &&
+					evsel__leader(counter) != counter &&
+					counter->weak_group) {
+					evlist__reset_weak_group(evsel_list, counter, false);
+					assert(counter->reset_group);
+					counter->supported = true;
+					second_pass = true;
+					break;
+				}
 
-			open_err = errno;
-			/*
-			 * Weak group failed. We cannot just undo this here
-			 * because earlier CPUs might be in group mode, and the kernel
-			 * doesn't support mixing group and non group reads. Defer
-			 * it to later.
-			 * Don't close here because we're in the wrong affinity.
-			 */
-			if ((open_err == EINVAL || open_err == EBADF) &&
-				evsel__leader(counter) != counter &&
-				counter->weak_group) {
-				evlist__reset_weak_group(evsel_list, counter, false);
-				assert(counter->reset_group);
-				counter->supported = true;
-				second_pass = true;
-				break;
+				if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
+					break;
 			}
-
-			if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
-				break;
 		}
 	}
-
 	if (second_pass) {
 		/*
 		 * Now redo all the weak group after closing them,
@@ -875,7 +855,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		 */
 
 		/* First close errored or weak retry */
-		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
+		evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
 			counter = evlist_cpu_itr.evsel;
 
 			if (!counter->reset_group && counter->supported)
@@ -884,7 +864,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 			perf_evsel__close_cpu(&counter->core, evlist_cpu_itr.cpu_map_idx);
 		}
 		/* Now reopen weak */
-		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
+		evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
 			counter = evlist_cpu_itr.evsel;
 
 			if (!counter->reset_group)
@@ -893,17 +873,18 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 			while (true) {
 				pr_debug2("reopening weak %s\n", evsel__name(counter));
 				if (create_perf_stat_counter(counter, &stat_config,
-							     evlist_cpu_itr.cpu_map_idx) == 0)
+							     evlist_cpu_itr.cpu_map_idx) == 0) {
+					evlist_cpu_iterator__exit(&evlist_cpu_itr);
 					break;
-
+				}
 				open_err = errno;
-				if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
+				if (stat_handle_error(counter, open_err) != COUNTER_RETRY) {
+					evlist_cpu_iterator__exit(&evlist_cpu_itr);
 					break;
+				}
 			}
 		}
 	}
-	affinity__cleanup(affinity);
-	affinity = NULL;
 
 	has_supported_counters = false;
 	evlist__for_each_entry(evsel_list, counter) {
@@ -937,9 +918,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	}
 
 	if (evlist__apply_filters(evsel_list, &counter, &target)) {
-		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
-			counter->filter, evsel__name(counter), errno,
-			str_error_r(errno, msg, sizeof(msg)));
+		pr_err("failed to set filter \"%s\" on event %s: %m\n",
+			counter->filter, evsel__name(counter));
 		return -1;
 	}
 
@@ -1001,8 +981,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		}
 
 		if (workload_exec_errno) {
-			const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
-			pr_err("Workload failed: %s\n", emsg);
+			errno = workload_exec_errno;
+			pr_err("Workload failed: %m\n");
 			err = -1;
 			goto err_out;
 		}
@@ -1066,7 +1046,6 @@ err_out:
 	if (forks)
 		evlist__cancel_workload(evsel_list);
 
-	affinity__cleanup(affinity);
 	return err;
 }
 
@@ -2447,6 +2426,7 @@ static int parse_tpebs_mode(const struct option *opt, const char *str,
 int cmd_stat(int argc, const char **argv)
 {
 	struct opt_aggr_mode opt_mode = {};
+	bool affinity = true, affinity_set = false;
 	struct option stat_options[] = {
 		OPT_BOOLEAN('T', "transaction", &transaction_run,
 			"hardware transaction statistics"),
@@ -2575,6 +2555,8 @@ int cmd_stat(int argc, const char **argv)
 			"don't print 'summary' for CSV summary output"),
 		OPT_BOOLEAN(0, "quiet", &quiet,
 			"don't print any output, messages or warnings (useful with record)"),
+		OPT_BOOLEAN_SET(0, "affinity", &affinity, &affinity_set,
+			"enable (default) or disable affinity optimizations to reduce IPIs"),
 		OPT_CALLBACK(0, "cputype", &evsel_list, "hybrid cpu type",
 			"Only enable events on applying cpu with this type "
 			"for hybrid platform (e.g. core or atom)",
@@ -2632,6 +2614,9 @@ int cmd_stat(int argc, const char **argv)
 	} else
 		stat_config.csv_sep = DEFAULT_SEPARATOR;
 
+	if (affinity_set)
+		evsel_list->no_affinity = !affinity;
+
 	if (argc && strlen(argv[0]) > 2 && strstarts("record", argv[0])) {
 		argc = __cmd_record(stat_options, &opt_mode, argc, argv);
 		if (argc < 0)
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index baee1f695600..311d9da9896a 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2616,12 +2616,10 @@ static struct syscall *trace__syscall_info(struct trace *trace, struct evsel *ev
 		err = syscall__read_info(sc, trace);
 
 	if (err && verbose > 0) {
-		char sbuf[STRERR_BUFSIZE];
-
-		fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err,
-			str_error_r(-err, sbuf, sizeof(sbuf)));
+		errno = -err;
+		fprintf(trace->output, "Problems reading syscall %d: %m", id);
 		if (sc && sc->name)
-			fprintf(trace->output, "(%s)", sc->name);
+			fprintf(trace->output, " (%s)", sc->name);
 		fputs(" information\n", trace->output);
 	}
 	return err ? NULL : sc;
@@ -2791,7 +2789,7 @@ static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
 	struct thread_trace *ttrace;
 
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
-	e_machine = thread__e_machine(thread, trace->host);
+	e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL);
 	sc = trace__syscall_info(trace, evsel, e_machine, id);
 	if (sc == NULL)
 		goto out_put;
@@ -2870,7 +2868,7 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
 
 
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
-	e_machine = thread__e_machine(thread, trace->host);
+	e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL);
 	sc = trace__syscall_info(trace, evsel, e_machine, id);
 	if (sc == NULL)
 		goto out_put;
@@ -2936,7 +2934,7 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
 	struct thread_trace *ttrace;
 
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
-	e_machine = thread__e_machine(thread, trace->host);
+	e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL);
 	sc = trace__syscall_info(trace, evsel, e_machine, id);
 	if (sc == NULL)
 		goto out_put;
@@ -3287,7 +3285,9 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel,
 
 	if (evsel == trace->syscalls.events.bpf_output) {
 		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
-		int e_machine = thread ? thread__e_machine(thread, trace->host) : EM_HOST;
+		int e_machine = thread
+			? thread__e_machine(thread, trace->host, /*e_flags=*/NULL)
+			: EM_HOST;
 		struct syscall *sc = trace__syscall_info(trace, evsel, e_machine, id);
 
 		if (sc) {
@@ -4673,9 +4673,8 @@ out_error:
 
 out_error_apply_filters:
 	fprintf(trace->output,
-		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
-		evsel->filter, evsel__name(evsel), errno,
-		str_error_r(errno, errbuf, sizeof(errbuf)));
+		"Failed to set filter \"%s\" on event %s: %m\n",
+		evsel->filter, evsel__name(evsel));
 	goto out_delete_evlist;
 }
 out_error_mem:
@@ -4683,7 +4682,7 @@ out_error_mem:
 	goto out_delete_evlist;
 
 out_errno:
-	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
+	fprintf(trace->output, "%m\n");
 	goto out_delete_evlist;
 }
 
@@ -4919,7 +4918,7 @@ static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trac
 {
 	size_t printed = 0;
 	struct thread_trace *ttrace = thread__priv(thread);
-	int e_machine = thread__e_machine(thread, trace->host);
+	int e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL);
 	double ratio;
 
 	if (ttrace == NULL)
@@ -5173,8 +5172,8 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
 				      int unset __maybe_unused)
 {
 	struct trace *trace = (struct trace *)opt->value;
-	const char *s = str;
-	char *sep = NULL, *lists[2] = { NULL, NULL, };
+	const char *s;
+	char *strd, *sep = NULL, *lists[2] = { NULL, NULL, };
 	int len = strlen(str) + 1, err = -1, list, idx;
 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
 	char group_name[PATH_MAX];
@@ -5183,13 +5182,17 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
 	if (strace_groups_dir == NULL)
 		return -1;
 
+	s = strd = strdup(str);
+	if (strd == NULL)
+		return -1;
+
 	if (*s == '!') {
 		++s;
 		trace->not_ev_qualifier = true;
 	}
 
 	while (1) {
-		if ((sep = strchr(s, ',')) != NULL)
+		if ((sep = strchr((char *)s, ',')) != NULL)
 			*sep = '\0';
 
 		list = 0;
@@ -5257,8 +5260,7 @@ out:
 	free(strace_groups_dir);
 	free(lists[0]);
 	free(lists[1]);
-	if (sep)
-		*sep = ',';
+	free(strd);
 
 	return err;
 }
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index e0537f275da2..da3aca87457f 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -54,7 +54,6 @@ declare -a FILES=(
   "arch/s390/include/uapi/asm/kvm.h"
   "arch/s390/include/uapi/asm/sie.h"
   "arch/arm64/include/uapi/asm/kvm.h"
-  "arch/arm64/include/uapi/asm/unistd.h"
   "arch/alpha/include/uapi/asm/errno.h"
   "arch/mips/include/asm/errno.h"
   "arch/mips/include/uapi/asm/errno.h"
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
deleted file mode 100644
index e8d2762adade..000000000000
--- a/tools/perf/command-list.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# List of known perf commands.
-# command name			category [deprecated] [common]
-#
-perf-annotate			mainporcelain common
-perf-archive			mainporcelain common
-perf-bench			mainporcelain common
-perf-buildid-cache		mainporcelain common
-perf-buildid-list		mainporcelain common
-perf-data			mainporcelain common
-perf-diff			mainporcelain common
-perf-c2c			mainporcelain common
-perf-config			mainporcelain common
-perf-evlist			mainporcelain common
-perf-ftrace			mainporcelain common
-perf-inject			mainporcelain common
-perf-iostat			mainporcelain common
-perf-kallsyms			mainporcelain common
-perf-kmem			mainporcelain traceevent
-perf-kvm			mainporcelain common
-perf-kwork			mainporcelain traceevent
-perf-list			mainporcelain common
-perf-lock			mainporcelain traceevent
-perf-mem			mainporcelain common
-perf-probe			mainporcelain full
-perf-record			mainporcelain common
-perf-report			mainporcelain common
-perf-sched			mainporcelain traceevent
-perf-script			mainporcelain common
-perf-stat			mainporcelain common
-perf-test			mainporcelain common
-perf-timechart			mainporcelain traceevent
-perf-top			mainporcelain common
-perf-trace			mainporcelain audit
-perf-version			mainporcelain common
-perf-daemon			mainporcelain common
diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c
index 82514e6532b8..87bfd4781003 100644
--- a/tools/perf/jvmti/libjvmti.c
+++ b/tools/perf/jvmti/libjvmti.c
@@ -142,7 +142,7 @@ copy_class_filename(const char * class_sign, const char * file_name, char * resu
 	*/
 	if (*class_sign == 'L') {
 		size_t j, i = 0;
-		char *p = strrchr(class_sign, '/');
+		const char *p = strrchr(class_sign, '/');
 		if (p) {
 			/* drop the 'L' prefix and copy up to the final '/' */
 			for (i = 0; i < (size_t)(p - class_sign); i++)
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 88c60ecf3395..f475a8664ffc 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -169,8 +169,8 @@ static int set_debug_file(const char *path)
 {
 	debug_fp = fopen(path, "w");
 	if (!debug_fp) {
-		fprintf(stderr, "Open debug file '%s' failed: %s\n",
-			path, strerror(errno));
+		fprintf(stderr, "Open debug file '%s' failed: %m\n",
+			path);
 		return -1;
 	}
 
@@ -335,7 +335,6 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
 {
 	int status;
 	struct stat st;
-	char sbuf[STRERR_BUFSIZE];
 
 	if (use_browser == -1)
 		use_browser = check_browser_config(p->cmd);
@@ -363,17 +362,15 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
 	status = 1;
 	/* Check for ENOSPC and EIO errors.. */
 	if (fflush(stdout)) {
-		fprintf(stderr, "write failure on standard output: %s",
-			str_error_r(errno, sbuf, sizeof(sbuf)));
+		fprintf(stderr, "write failure on standard output: %m\n");
 		goto out;
 	}
 	if (ferror(stdout)) {
-		fprintf(stderr, "unknown write failure on standard output");
+		fprintf(stderr, "unknown write failure on standard output\n");
 		goto out;
 	}
 	if (fclose(stdout)) {
-		fprintf(stderr, "close failed on standard output: %s",
-			str_error_r(errno, sbuf, sizeof(sbuf)));
+		fprintf(stderr, "close failed on standard output: %m\n");
 		goto out;
 	}
 	status = 0;
@@ -459,7 +456,6 @@ int main(int argc, const char **argv)
 {
 	int err, done_help = 0;
 	const char *cmd;
-	char sbuf[STRERR_BUFSIZE];
 
 	perf_debug_setup();
 
@@ -573,8 +569,8 @@ int main(int argc, const char **argv)
 	}
 
 	if (cmd) {
-		fprintf(stderr, "Failed to run command '%s': %s\n",
-			cmd, str_error_r(errno, sbuf, sizeof(sbuf)));
+		fprintf(stderr, "Failed to run command '%s': %m\n",
+			cmd);
 	}
 out:
 	if (debug_fp)
diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build
index a46ab7b612df..63c65788d442 100644
--- a/tools/perf/pmu-events/Build
+++ b/tools/perf/pmu-events/Build
@@ -1,52 +1,167 @@
-pmu-events-y	+= pmu-events.o
-JSON		=  $(shell find pmu-events/arch -name '*.json' -o -name '*.csv')
-JDIR_TEST	=  pmu-events/arch/test
-JSON_TEST	=  $(shell [ -d $(JDIR_TEST) ] &&			\
-			find $(JDIR_TEST) -name '*.json')
-JEVENTS_PY	=  pmu-events/jevents.py
-METRIC_PY	=  pmu-events/metric.py
-METRIC_TEST_PY	=  pmu-events/metric_test.py
 EMPTY_PMU_EVENTS_C = pmu-events/empty-pmu-events.c
+# pmu-events.c will be generated by jevents.py or copied from EMPTY_PMU_EVENTS_C
 PMU_EVENTS_C	=  $(OUTPUT)pmu-events/pmu-events.c
-METRIC_TEST_LOG	=  $(OUTPUT)pmu-events/metric_test.log
-TEST_EMPTY_PMU_EVENTS_C = $(OUTPUT)pmu-events/test-empty-pmu-events.c
-EMPTY_PMU_EVENTS_TEST_LOG = $(OUTPUT)pmu-events/empty-pmu-events.log
-LEGACY_CACHE_PY	=  pmu-events/make_legacy_cache.py
-LEGACY_CACHE_JSON = $(OUTPUT)pmu-events/arch/common/common/legacy-cache.json
+pmu-events-y	+= pmu-events.o
 
-ifeq ($(JEVENTS_ARCH),)
-JEVENTS_ARCH=$(SRCARCH)
-endif
-JEVENTS_MODEL ?= all
+# pmu-events.c file is generated in the OUTPUT directory so it needs a
+# separate rule to depend on it properly
+$(OUTPUT)pmu-events/pmu-events.o: $(PMU_EVENTS_C)
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_o_c)
 
-#
-# Locate/process JSON files in pmu-events/arch/
-# directory and create tables in pmu-events.c.
-#
+# Message for $(call echo-cmd,cp), possibly remove the src file from
+# the destination to save space in the build log.
+quiet_cmd_cp   = COPY    $(patsubst %$<,%,$@) <- $<
 
+# --- NO_JEVENTS=1 build ---
 ifeq ($(NO_JEVENTS),1)
 $(PMU_EVENTS_C): $(EMPTY_PMU_EVENTS_C)
 	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)cp $< $@
+	$(Q)$(call echo-cmd,cp)cp $< $@
 else
-# Copy checked-in json to OUTPUT for generation if it's an out of source build
-ifneq ($(OUTPUT),)
-$(OUTPUT)pmu-events/arch/%: pmu-events/arch/%
-	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)cp $< $@
+# --- Regular build ---
+
+# Setup the JEVENTS_ARCH and JEVENTS_MODEL
+ifeq ($(JEVENTS_ARCH),)
+JEVENTS_ARCH=$(SRCARCH)
+endif
+JEVENTS_MODEL ?= all
+
+# The input json/csv files
+SRC_DIR		:= pmu-events/arch
+ifeq ($(JEVENTS_ARCH),all)
+SRC_JSON	:= $(shell find $(SRC_DIR) -name '*.json' -o -name '*.csv')
+else
+SRC_JSON	:= $(shell find $(SRC_DIR)/common $(SRC_DIR)/test $(SRC_DIR)/$(JEVENTS_ARCH) -name '*.json' -o -name '*.csv')
 endif
 
+# Python to build the generic legacy cache events
+LEGACY_CACHE_PY	=  pmu-events/make_legacy_cache.py
+LEGACY_CACHE_JSON = $(OUTPUT)pmu-events/arch/common/common/legacy-cache.json
+GEN_JSON = $(LEGACY_CACHE_JSON)
+
 $(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $(LEGACY_CACHE_PY) > $@
 
-GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) $(LEGACY_CACHE_JSON)
+# Python to generate architectural metrics
+GEN_METRIC_DEPS := pmu-events/metric.py pmu-events/common_metrics.py
+# Functions to extract the model from an extra-metrics.json or extra-metricgroups.json path.
+model_name = $(shell echo $(1)|sed -e 's@.\+/\(.*\)/extra-metric.*\.json@\1@')
+vendor_name = $(shell echo $(1)|sed -e 's@.\+/\(.*\)/[^/]*/extra-metric.*\.json@\1@')
+
+ifeq ($(JEVENTS_ARCH),$(filter $(JEVENTS_ARCH),x86 all))
+# Generate AMD Json
+ZENS = $(shell ls -d pmu-events/arch/x86/amdzen*)
+ZEN_METRICS = $(foreach x,$(ZENS),$(OUTPUT)$(x)/extra-metrics.json)
+ZEN_METRICGROUPS = $(foreach x,$(ZENS),$(OUTPUT)$(x)/extra-metricgroups.json)
+GEN_JSON += $(ZEN_METRICS) $(ZEN_METRICGROUPS)
+
+$(ZEN_METRICS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) pmu-events/arch > $@
+
+$(ZEN_METRICGROUPS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) pmu-events/arch > $@
+
+endif
+
+ifeq ($(JEVENTS_ARCH),$(filter $(JEVENTS_ARCH),arm64 all))
+# Generate ARM Json
+ARMS = $(shell ls -d pmu-events/arch/arm64/arm/*|grep -v cmn)
+ARM_METRICS = $(foreach x,$(ARMS),$(OUTPUT)$(x)/extra-metrics.json)
+ARM_METRICGROUPS = $(foreach x,$(ARMS),$(OUTPUT)$(x)/extra-metricgroups.json)
+GEN_JSON += $(ARM_METRICS) $(ARM_METRICGROUPS)
+
+$(ARM_METRICS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call vendor_name,$@) $(call model_name,$@) pmu-events/arch > $@
+
+$(ARM_METRICGROUPS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call vendor_name,$@) $(call model_name,$@) pmu-events/arch > $@
+
+endif
+
+ifeq ($(JEVENTS_ARCH),$(filter $(JEVENTS_ARCH),x86 all))
+# Generate Intel Json
+INTELS = $(shell ls -d pmu-events/arch/x86/*|grep -v amdzen|grep -v mapfile.csv)
+INTEL_METRICS = $(foreach x,$(INTELS),$(OUTPUT)$(x)/extra-metrics.json)
+INTEL_METRICGROUPS = $(foreach x,$(INTELS),$(OUTPUT)$(x)/extra-metricgroups.json)
+GEN_JSON += $(INTEL_METRICS) $(INTEL_METRICGROUPS)
+
+$(INTEL_METRICS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) pmu-events/arch > $@
+
+$(INTEL_METRICGROUPS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) pmu-events/arch > $@
+
+endif
+
+OUT_DIR		:= $(OUTPUT)pmu-events/arch
+
+ifeq ($(OUTPUT),)
+OUT_JSON	:= $(SRC_JSON)
+ORPHAN_FILES	:=
+else
+# Things that need to be built in the OUTPUT directory. Note, ensure
+# there is a slash after the directory name so that it matches what
+# $(dir) gives in COPY_RULE.
+OUT_JSON	:= $(patsubst $(SRC_DIR)/%,$(OUT_DIR)/%,$(SRC_JSON))
+OUT_DIRS	:= $(sort $(patsubst %/,%,$(dir $(OUT_JSON))))
+
+# Things already in the OUTPUT directory
+CUR_OUT_JSON	:= $(shell [ -d $(OUT_DIR) ] && find $(OUT_DIR) -type f)
+
+# Things in the OUTPUT directory but shouldn't be there as computed by
+# OUT_JSON and GEN_JSON.
+ORPHAN_FILES	:= $(filter-out $(OUT_JSON) $(GEN_JSON),$(CUR_OUT_JSON))
+
+# Message for $(call echo-cmd,mkd). There is already a mkdir message
+# but it assumes $@ is a file to mkdir the directory for.
+quiet_cmd_mkd  = MKDIR   $@
+
+$(OUT_DIRS):
+	$(Q)$(call echo-cmd,mkd)mkdir -p $@
+
+# Explicitly generate rules to copy SRC_JSON files as $(dir) cannot
+# apply to $@ in a dependency. Exclude from the copy rules any that
+# look like they are copying generated json. This happens as a perf
+# build within the tools/perf directory will leave generated json
+# files within the tree, these then get picked up by SRC_JSON find.
+define COPY_RULE
+$(2): $(1) | $(3)
+	$$(Q)$$(call echo-cmd,cp)cp $(1) $(2)
+endef
+$(foreach src,$(SRC_JSON), \
+    $(eval dest := $(patsubst $(SRC_DIR)/%,$(OUT_DIR)/%,$(src))) \
+    $(eval ddir := $(patsubst %/,%,$(dir $(dest)))) \
+    $(if $(filter $(dest),$(GEN_JSON)),, \
+        $(eval $(call COPY_RULE,$(src),$(dest),$(ddir))) \
+    ) \
+)
+
+endif # ifneq ($(OUTPUT),)
+
+JEVENTS_PY	=  pmu-events/jevents.py
+METRIC_PY	=  pmu-events/metric.py
+
+# Rule to run the metric test.
+METRIC_TEST_PY	=  pmu-events/metric_test.py
+METRIC_TEST_LOG	=  $(OUTPUT)pmu-events/metric_test.log
 
 $(METRIC_TEST_LOG): $(METRIC_TEST_PY) $(METRIC_PY)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,test)$(PYTHON) $< 2> $@ || (cat $@ && false)
 
-$(TEST_EMPTY_PMU_EVENTS_C): $(GEN_JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG)
+# Rule to create then ensure the empty-pmu-events.c is in sync.
+TEST_EMPTY_PMU_EVENTS_C = $(OUTPUT)pmu-events/test-empty-pmu-events.c
+EMPTY_PMU_EVENTS_TEST_LOG = $(OUTPUT)pmu-events/empty-pmu-events.log
+
+$(TEST_EMPTY_PMU_EVENTS_C): $(OUT_JSON) $(GEN_JSON) $(JEVENTS_PY) $(METRIC_PY)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) none none $(OUTPUT)pmu-events/arch $@
 
@@ -54,36 +169,60 @@ $(EMPTY_PMU_EVENTS_TEST_LOG): $(EMPTY_PMU_EVENTS_C) $(TEST_EMPTY_PMU_EVENTS_C)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,test)diff -u $^ 2> $@ || (cat $@ && false)
 
+
+# Dependencies for jevents.py
+JEVENTS_DEPS := $(OUT_JSON) $(GEN_JSON) $(JEVENTS_PY) $(METRIC_PY) $(EMPTY_PMU_EVENTS_TEST_LOG) $(METRIC_TEST_LOG)
+
+# Rules to run mypy if enabled.
 ifdef MYPY
-  PMU_EVENTS_PY_TESTS := $(wildcard *.py)
-  PMU_EVENTS_MYPY_TEST_LOGS := $(JEVENTS_PY_TESTS:%=%.mypy_log)
-else
-  PMU_EVENTS_MYPY_TEST_LOGS :=
-endif
+define MYPY_RULE
+$(2): $(1)
+	$$(Q)$$(call echo-cmd,test)mypy $(1) > $(2) || (cat $(2) && rm $(2) && false)
+endef
+$(foreach src,$(wildcard pmu-events/*.py), \
+    $(eval dest := $(patsubst pmu-events/%,$(OUTPUT)pmu-events/%.mypy_log,$(src))) \
+    $(eval $(call MYPY_RULE,$(src),$(dest))) \
+)
 
-$(OUTPUT)%.mypy_log: %
-	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,test)mypy "$<" > $@ || (cat $@ && rm $@ && false)
+MYPY_INPUTS := $(wildcard pmu-events/*.py)
+MYPY_OUTPUTS := $(patsubst pmu-events/%,$(OUTPUT)pmu-events/%.mypy_log,$(MYPY_INPUTS))
+JEVENTS_DEPS += $(MYPY_OUTPUTS)
+endif
 
+# Rules to run pylint if enabled.
 ifdef PYLINT
-  PMU_EVENTS_PY_TESTS := $(wildcard *.py)
-  PMU_EVENTS_PYLINT_TEST_LOGS := $(JEVENTS_PY_TESTS:%=%.pylint_log)
-else
-  PMU_EVENTS_PYLINT_TEST_LOGS :=
+define PYLINT_RULE
+$(2): $(1)
+	$$(Q)$$(call echo-cmd,test)pylint $(1) > $(2) || (cat $(2) && rm $(2) && false)
+endef
+$(foreach src,$(wildcard pmu-events/*.py), \
+    $(eval dest := $(patsubst pmu-events/%,$(OUTPUT)pmu-events/%.pylint_log,$(src))) \
+    $(eval $(call PYLINT_RULE,$(src),$(dest))) \
+)
+
+PYLINT_INPUTS := $(wildcard pmu-events/*.py)
+PYLINT_OUTPUTS := $(patsubst pmu-events/%,$(OUTPUT)pmu-events/%.pylint_log,$(PYLINT_INPUTS))
+JEVENTS_DEPS += $(PYLINT_OUTPUTS)
 endif
 
-$(OUTPUT)%.pylint_log: %
-	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,test)pylint "$<" > $@ || (cat $@ && rm $@ && false)
+# If there are orphaned files remove them.
+ifneq ($(strip $(ORPHAN_FILES)),)
+.PHONY: prune_orphans
 
-$(PMU_EVENTS_C): $(GEN_JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG) \
-    $(EMPTY_PMU_EVENTS_TEST_LOG) $(PMU_EVENTS_MYPY_TEST_LOGS) $(PMU_EVENTS_PYLINT_TEST_LOGS)
-	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) $(JEVENTS_ARCH) $(JEVENTS_MODEL) $(OUTPUT)pmu-events/arch $@
+# Message for $(call echo-cmd,rm). Generally cleaning files isn't part
+# of a build step.
+quiet_cmd_rm  = RM      $^
+
+prune_orphans: $(ORPHAN_FILES)
+	$(Q)$(call echo-cmd,rm)rm -f $^
+
+JEVENTS_DEPS += prune_orphans
 endif
 
-# pmu-events.c file is generated in the OUTPUT directory so it needs a
-# separate rule to depend on it properly
-$(OUTPUT)pmu-events/pmu-events.o: $(PMU_EVENTS_C)
+# Finally, the rule to build pmu-events.c using jevents.py. All test
+# and inputs are dependencies.
+$(PMU_EVENTS_C): $(JEVENTS_DEPS)
 	$(call rule_mkdir)
-	$(call if_changed_dep,cc_o_c)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) $(JEVENTS_ARCH) $(JEVENTS_MODEL) $(OUT_DIR) $@
+
+endif # ifeq ($(NO_JEVENTS),1)
diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
new file mode 100755
index 000000000000..e2defaffde3e
--- /dev/null
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -0,0 +1,492 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+import argparse
+import math
+import os
+from typing import Optional
+from common_metrics import Cycles
+from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric,
+                    JsonEncodeMetricGroupDescriptions, Literal, LoadEvents,
+                    Metric, MetricGroup, Select)
+
+# Global command line arguments.
+_args = None
+_zen_model: int = 1
+interval_sec = Event("duration_time")
+ins = Event("instructions")
+cycles = Event("cycles")
+# Number of CPU cycles scaled for SMT.
+smt_cycles = Select(cycles / 2, Literal("#smt_on"), cycles)
+
+
+def AmdBr():
+    def Total() -> MetricGroup:
+        br = Event("ex_ret_brn")
+        br_m_all = Event("ex_ret_brn_misp")
+        br_clr = Event("ex_ret_brn_cond_misp",
+                       "ex_ret_msprd_brnch_instr_dir_msmtch",
+                       "ex_ret_brn_resync")
+
+        br_r = d_ratio(br, interval_sec)
+        ins_r = d_ratio(ins, br)
+        misp_r = d_ratio(br_m_all, br)
+        clr_r = d_ratio(br_clr, interval_sec)
+
+        return MetricGroup("lpm_br_total", [
+            Metric("lpm_br_total_retired",
+                   "The number of branch instructions retired per second.", br_r,
+                   "insn/s"),
+            Metric(
+                "lpm_br_total_mispred",
+                "The number of branch instructions retired, of any type, that were "
+                "not correctly predicted as a percentage of all branch instrucions.",
+                misp_r, "100%"),
+            Metric("lpm_br_total_insn_between_branches",
+                   "The number of instructions divided by the number of branches.",
+                   ins_r, "insn"),
+            Metric("lpm_br_total_insn_fe_resteers",
+                   "The number of resync branches per second.", clr_r, "req/s")
+        ])
+
+    def Taken() -> MetricGroup:
+        br = Event("ex_ret_brn_tkn")
+        br_m_tk = Event("ex_ret_brn_tkn_misp")
+        br_r = d_ratio(br, interval_sec)
+        ins_r = d_ratio(ins, br)
+        misp_r = d_ratio(br_m_tk, br)
+        return MetricGroup("lpm_br_taken", [
+            Metric("lpm_br_taken_retired",
+                   "The number of taken branches that were retired per second.",
+                   br_r, "insn/s"),
+            Metric(
+                "lpm_br_taken_mispred",
+                "The number of retired taken branch instructions that were "
+                "mispredicted as a percentage of all taken branches.", misp_r,
+                "100%"),
+            Metric(
+                "lpm_br_taken_insn_between_branches",
+                "The number of instructions divided by the number of taken branches.",
+                ins_r, "insn"),
+        ])
+
+    def Conditional() -> Optional[MetricGroup]:
+        global _zen_model
+        br = Event("ex_ret_brn_cond", "ex_ret_cond")
+        br_r = d_ratio(br, interval_sec)
+        ins_r = d_ratio(ins, br)
+
+        metrics = [
+            Metric("lpm_br_cond_retired", "Retired conditional branch instructions.",
+                   br_r, "insn/s"),
+            Metric("lpm_br_cond_insn_between_branches",
+                   "The number of instructions divided by the number of conditional "
+                   "branches.", ins_r, "insn"),
+        ]
+        if _zen_model == 2:
+            br_m_cond = Event("ex_ret_cond_misp")
+            misp_r = d_ratio(br_m_cond, br)
+            metrics += [
+                Metric("lpm_br_cond_mispred",
+                       "Retired conditional branch instructions mispredicted as a "
+                       "percentage of all conditional branches.", misp_r, "100%"),
+            ]
+
+        return MetricGroup("lpm_br_cond", metrics)
+
+    def Fused() -> MetricGroup:
+        br = Event("ex_ret_fused_instr", "ex_ret_fus_brnch_inst")
+        br_r = d_ratio(br, interval_sec)
+        ins_r = d_ratio(ins, br)
+        return MetricGroup("lpm_br_cond", [
+            Metric("lpm_br_fused_retired",
+                   "Retired fused branch instructions per second.", br_r, "insn/s"),
+            Metric(
+                "lpm_br_fused_insn_between_branches",
+                "The number of instructions divided by the number of fused "
+                "branches.", ins_r, "insn"),
+        ])
+
+    def Far() -> MetricGroup:
+        br = Event("ex_ret_brn_far")
+        br_r = d_ratio(br, interval_sec)
+        ins_r = d_ratio(ins, br)
+        return MetricGroup("lpm_br_far", [
+            Metric("lpm_br_far_retired", "Retired far control transfers per second.",
+                   br_r, "insn/s"),
+            Metric(
+                "lpm_br_far_insn_between_branches",
+                "The number of instructions divided by the number of far branches.",
+                ins_r, "insn"),
+        ])
+
+    return MetricGroup("lpm_br", [Total(), Taken(), Conditional(), Fused(), Far()],
+                       description="breakdown of retired branch instructions")
+
+
+def AmdCtxSw() -> MetricGroup:
+    cs = Event("context\\-switches")
+    metrics = [
+        Metric("lpm_cs_rate", "Context switches per second",
+               d_ratio(cs, interval_sec), "ctxsw/s")
+    ]
+
+    ev = Event("instructions")
+    metrics.append(Metric("lpm_cs_instr", "Instructions per context switch",
+                          d_ratio(ev, cs), "instr/cs"))
+
+    ev = Event("cycles")
+    metrics.append(Metric("lpm_cs_cycles", "Cycles per context switch",
+                          d_ratio(ev, cs), "cycles/cs"))
+
+    ev = Event("ls_dispatch.pure_ld", "ls_dispatch.ld_dispatch")
+    metrics.append(Metric("lpm_cs_loads", "Loads per context switch",
+                          d_ratio(ev, cs), "loads/cs"))
+
+    ev = Event("ls_dispatch.pure_st", "ls_dispatch.store_dispatch")
+    metrics.append(Metric("lpm_cs_stores", "Stores per context switch",
+                          d_ratio(ev, cs), "stores/cs"))
+
+    ev = Event("ex_ret_brn_tkn")
+    metrics.append(Metric("lpm_cs_br_taken", "Branches taken per context switch",
+                          d_ratio(ev, cs), "br_taken/cs"))
+
+    return MetricGroup("lpm_cs", metrics,
+                       description=("Number of context switches per second, instructions "
+                                    "retired & core cycles between context switches"))
+
+
+def AmdDtlb() -> Optional[MetricGroup]:
+    global _zen_model
+    if _zen_model >= 4:
+        return None
+
+    d_dat = Event("ls_dc_accesses") if _zen_model <= 3 else None
+    d_h4k = Event("ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit")
+    d_hcoal = Event(
+        "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit") if _zen_model >= 2 else 0
+    d_h2m = Event("ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit")
+    d_h1g = Event("ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit")
+
+    d_m4k = Event("ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss")
+    d_mcoal = Event(
+        "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss") if _zen_model >= 2 else 0
+    d_m2m = Event("ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss")
+    d_m1g = Event("ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss")
+
+    d_w0 = Event("ls_tablewalker.dc_type0") if _zen_model <= 3 else None
+    d_w1 = Event("ls_tablewalker.dc_type1") if _zen_model <= 3 else None
+    walks = d_w0 + d_w1
+    walks_r = d_ratio(walks, interval_sec)
+    ins_w = d_ratio(ins, walks)
+    l1 = d_dat
+    l1_r = d_ratio(l1, interval_sec)
+    l2_hits = d_h4k + d_hcoal + d_h2m + d_h1g
+    l2_miss = d_m4k + d_mcoal + d_m2m + d_m1g
+    l2_r = d_ratio(l2_hits + l2_miss, interval_sec)
+    l1_miss = l2_hits + l2_miss + walks
+    l1_hits = max(l1 - l1_miss, 0)
+    ins_l = d_ratio(ins, l1_miss)
+
+    return MetricGroup("lpm_dtlb", [
+        MetricGroup("lpm_dtlb_ov", [
+            Metric("lpm_dtlb_ov_insn_bt_l1_miss",
+                   "DTLB overview: instructions between l1 misses.", ins_l,
+                   "insns"),
+            Metric("lpm_dtlb_ov_insn_bt_walks",
+                   "DTLB overview: instructions between dtlb page table walks.",
+                   ins_w, "insns"),
+        ]),
+        MetricGroup("lpm_dtlb_l1", [
+            Metric("lpm_dtlb_l1_hits",
+                   "DTLB L1 hits as percentage of all DTLB L1 accesses.",
+                   d_ratio(l1_hits, l1), "100%"),
+            Metric("lpm_dtlb_l1_miss",
+                   "DTLB L1 misses as percentage of all DTLB L1 accesses.",
+                   d_ratio(l1_miss, l1), "100%"),
+            Metric("lpm_dtlb_l1_reqs", "DTLB L1 accesses per second.", l1_r,
+                   "insns/s"),
+        ]),
+        MetricGroup("lpm_dtlb_l2", [
+            Metric("lpm_dtlb_l2_hits",
+                   "DTLB L2 hits as percentage of all DTLB L2 accesses.",
+                   d_ratio(l2_hits, l2_hits + l2_miss), "100%"),
+            Metric("lpm_dtlb_l2_miss",
+                   "DTLB L2 misses as percentage of all DTLB L2 accesses.",
+                   d_ratio(l2_miss, l2_hits + l2_miss), "100%"),
+            Metric("lpm_dtlb_l2_reqs", "DTLB L2 accesses per second.", l2_r,
+                   "insns/s"),
+            MetricGroup("lpm_dtlb_l2_4kb", [
+                Metric(
+                    "lpm_dtlb_l2_4kb_hits",
+                    "DTLB L2 4kb page size hits as percentage of all DTLB L2 4kb "
+                    "accesses.", d_ratio(d_h4k, d_h4k + d_m4k), "100%"),
+                Metric(
+                    "lpm_dtlb_l2_4kb_miss",
+                    "DTLB L2 4kb page size misses as percentage of all DTLB L2 4kb"
+                    "accesses.", d_ratio(d_m4k, d_h4k + d_m4k), "100%")
+            ]),
+            MetricGroup("lpm_dtlb_l2_coalesced", [
+                Metric(
+                    "lpm_dtlb_l2_coal_hits",
+                    "DTLB L2 coalesced page (16kb) hits as percentage of all DTLB "
+                    "L2 coalesced accesses.", d_ratio(d_hcoal,
+                                                      d_hcoal + d_mcoal), "100%"),
+                Metric(
+                    "lpm_dtlb_l2_coal_miss",
+                    "DTLB L2 coalesced page (16kb) misses as percentage of all "
+                    "DTLB L2 coalesced accesses.",
+                    d_ratio(d_mcoal, d_hcoal + d_mcoal), "100%")
+            ]),
+            MetricGroup("lpm_dtlb_l2_2mb", [
+                Metric(
+                    "lpm_dtlb_l2_2mb_hits",
+                    "DTLB L2 2mb page size hits as percentage of all DTLB L2 2mb "
+                    "accesses.", d_ratio(d_h2m, d_h2m + d_m2m), "100%"),
+                Metric(
+                    "lpm_dtlb_l2_2mb_miss",
+                    "DTLB L2 2mb page size misses as percentage of all DTLB L2 "
+                    "accesses.", d_ratio(d_m2m, d_h2m + d_m2m), "100%")
+            ]),
+            MetricGroup("lpm_dtlb_l2_1g", [
+                Metric(
+                    "lpm_dtlb_l2_1g_hits",
+                    "DTLB L2 1gb page size hits as percentage of all DTLB L2 1gb "
+                    "accesses.", d_ratio(d_h1g, d_h1g + d_m1g), "100%"),
+                Metric(
+                    "lpm_dtlb_l2_1g_miss",
+                    "DTLB L2 1gb page size misses as percentage of all DTLB L2 "
+                    "1gb accesses.", d_ratio(d_m1g, d_h1g + d_m1g), "100%")
+            ]),
+        ]),
+        MetricGroup("lpm_dtlb_walks", [
+            Metric("lpm_dtlb_walks_reqs", "DTLB page table walks per second.",
+                   walks_r, "walks/s"),
+        ]),
+    ], description="Data TLB metrics")
+
+
+def AmdItlb():
+    global _zen_model
+    l2h = Event("bp_l1_tlb_miss_l2_tlb_hit", "bp_l1_tlb_miss_l2_hit")
+    l2m = Event("l2_itlb_misses")
+    l2r = l2h + l2m
+
+    itlb_l1_mg = None
+    l1m = l2r
+    if _zen_model <= 3:
+        l1r = Event("ic_fw32")
+        l1h = max(l1r - l1m, 0)
+        itlb_l1_mg = MetricGroup("lpm_itlb_l1", [
+            Metric("lpm_itlb_l1_hits",
+                   "L1 ITLB hits as a perecentage of L1 ITLB accesses.",
+                   d_ratio(l1h, l1h + l1m), "100%"),
+            Metric("lpm_itlb_l1_miss",
+                   "L1 ITLB misses as a perecentage of L1 ITLB accesses.",
+                   d_ratio(l1m, l1h + l1m), "100%"),
+            Metric("lpm_itlb_l1_reqs",
+                   "The number of 32B fetch windows transferred from IC pipe to DE "
+                   "instruction decoder per second.", d_ratio(
+                       l1r, interval_sec),
+                   "windows/sec"),
+        ])
+
+    return MetricGroup("lpm_itlb", [
+        MetricGroup("lpm_itlb_ov", [
+            Metric("lpm_itlb_ov_insn_bt_l1_miss",
+                   "Number of instructions between l1 misses", d_ratio(
+                       ins, l1m), "insns"),
+            Metric("lpm_itlb_ov_insn_bt_l2_miss",
+                   "Number of instructions between l2 misses", d_ratio(
+                       ins, l2m), "insns"),
+        ]),
+        itlb_l1_mg,
+        MetricGroup("lpm_itlb_l2", [
+            Metric("lpm_itlb_l2_hits",
+                   "L2 ITLB hits as a percentage of all L2 ITLB accesses.",
+                   d_ratio(l2h, l2r), "100%"),
+            Metric("lpm_itlb_l2_miss",
+                   "L2 ITLB misses as a percentage of all L2 ITLB accesses.",
+                   d_ratio(l2m, l2r), "100%"),
+            Metric("lpm_itlb_l2_reqs", "ITLB accesses per second.",
+                   d_ratio(l2r, interval_sec), "accesses/sec"),
+        ]),
+    ], description="Instruction TLB breakdown")
+
+
+def AmdLdSt() -> MetricGroup:
+    ldst_ld = Event("ls_dispatch.pure_ld", "ls_dispatch.ld_dispatch")
+    ldst_st = Event("ls_dispatch.pure_st", "ls_dispatch.store_dispatch")
+    ldst_ldc1 = Event(f"{ldst_ld}/cmask=1/")
+    ldst_stc1 = Event(f"{ldst_st}/cmask=1/")
+    ldst_ldc2 = Event(f"{ldst_ld}/cmask=2/")
+    ldst_stc2 = Event(f"{ldst_st}/cmask=2/")
+    ldst_ldc3 = Event(f"{ldst_ld}/cmask=3/")
+    ldst_stc3 = Event(f"{ldst_st}/cmask=3/")
+    ldst_cyc = Event("ls_not_halted_cyc")
+
+    ld_rate = d_ratio(ldst_ld, interval_sec)
+    st_rate = d_ratio(ldst_st, interval_sec)
+
+    ld_v1 = max(ldst_ldc1 - ldst_ldc2, 0)
+    ld_v2 = max(ldst_ldc2 - ldst_ldc3, 0)
+    ld_v3 = ldst_ldc3
+
+    st_v1 = max(ldst_stc1 - ldst_stc2, 0)
+    st_v2 = max(ldst_stc2 - ldst_stc3, 0)
+    st_v3 = ldst_stc3
+
+    return MetricGroup("lpm_ldst", [
+        MetricGroup("lpm_ldst_total", [
+            Metric("lpm_ldst_total_ld", "Number of loads dispatched per second.",
+                   ld_rate, "insns/sec"),
+            Metric("lpm_ldst_total_st", "Number of stores dispatched per second.",
+                   st_rate, "insns/sec"),
+        ]),
+        MetricGroup("lpm_ldst_percent_insn", [
+            Metric("lpm_ldst_percent_insn_ld",
+                   "Load instructions as a percentage of all instructions.",
+                   d_ratio(ldst_ld, ins), "100%"),
+            Metric("lpm_ldst_percent_insn_st",
+                   "Store instructions as a percentage of all instructions.",
+                   d_ratio(ldst_st, ins), "100%"),
+        ]),
+        MetricGroup("lpm_ldst_ret_loads_per_cycle", [
+            Metric(
+                "lpm_ldst_ret_loads_per_cycle_1",
+                "Load instructions retiring in 1 cycle as a percentage of all "
+                "unhalted cycles.", d_ratio(ld_v1, ldst_cyc), "100%"),
+            Metric(
+                "lpm_ldst_ret_loads_per_cycle_2",
+                "Load instructions retiring in 2 cycles as a percentage of all "
+                "unhalted cycles.", d_ratio(ld_v2, ldst_cyc), "100%"),
+            Metric(
+                "lpm_ldst_ret_loads_per_cycle_3",
+                "Load instructions retiring in 3 or more cycles as a percentage"
+                "of all unhalted cycles.", d_ratio(ld_v3, ldst_cyc), "100%"),
+        ]),
+        MetricGroup("lpm_ldst_ret_stores_per_cycle", [
+            Metric(
+                "lpm_ldst_ret_stores_per_cycle_1",
+                "Store instructions retiring in 1 cycle as a percentage of all "
+                "unhalted cycles.", d_ratio(st_v1, ldst_cyc), "100%"),
+            Metric(
+                "lpm_ldst_ret_stores_per_cycle_2",
+                "Store instructions retiring in 2 cycles as a percentage of all "
+                "unhalted cycles.", d_ratio(st_v2, ldst_cyc), "100%"),
+            Metric(
+                "lpm_ldst_ret_stores_per_cycle_3",
+                "Store instructions retiring in 3 or more cycles as a percentage"
+                "of all unhalted cycles.", d_ratio(st_v3, ldst_cyc), "100%"),
+        ]),
+        MetricGroup("lpm_ldst_insn_bt", [
+            Metric("lpm_ldst_insn_bt_ld", "Number of instructions between loads.",
+                   d_ratio(ins, ldst_ld), "insns"),
+            Metric("lpm_ldst_insn_bt_st", "Number of instructions between stores.",
+                   d_ratio(ins, ldst_st), "insns"),
+        ])
+    ], description="Breakdown of load/store instructions")
+
+
+def AmdUpc() -> Metric:
+    ops = Event("ex_ret_ops", "ex_ret_cops")
+    upc = d_ratio(ops, smt_cycles)
+    return Metric("lpm_upc", "Micro-ops retired per core cycle (higher is better)",
+                  upc, "uops/cycle")
+
+
+def Idle() -> Metric:
+    cyc = Event("msr/mperf/")
+    tsc = Event("msr/tsc/")
+    low = max(tsc - cyc, 0)
+    return Metric(
+        "lpm_idle",
+        "Percentage of total wallclock cycles where CPUs are in low power state (C1 or deeper sleep state)",
+        d_ratio(low, tsc), "100%")
+
+
+def Rapl() -> MetricGroup:
+    """Processor socket power consumption estimate.
+
+    Use events from the running average power limit (RAPL) driver.
+    """
+    # Watts = joules/second
+    # Currently only energy-pkg is supported by AMD:
+    # https://lore.kernel.org/lkml/20220105185659.643355-1-eranian@google.com/
+    pkg = Event("power/energy\\-pkg/")
+    cond_pkg = Select(pkg, has_event(pkg), math.nan)
+    scale = 2.3283064365386962890625e-10
+    metrics = [
+        Metric("lpm_cpu_power_pkg", "",
+               d_ratio(cond_pkg * scale, interval_sec), "Watts"),
+    ]
+
+    return MetricGroup("lpm_cpu_power", metrics,
+                       description="Processor socket power consumption estimates")
+
+
+def UncoreL3():
+    acc = Event("l3_lookup_state.all_coherent_accesses_to_l3",
+                "l3_lookup_state.all_l3_req_typs")
+    miss = Event("l3_lookup_state.l3_miss",
+                 "l3_comb_clstr_state.request_miss")
+    acc = max(acc, miss)
+    hits = acc - miss
+
+    return MetricGroup("lpm_l3", [
+        Metric("lpm_l3_accesses", "L3 victim cache accesses",
+               d_ratio(acc, interval_sec), "accesses/sec"),
+        Metric("lpm_l3_hits", "L3 victim cache hit rate",
+               d_ratio(hits, acc), "100%"),
+        Metric("lpm_l3_miss", "L3 victim cache miss rate", d_ratio(miss, acc),
+               "100%"),
+    ], description="L3 cache breakdown per CCX")
+
+
+def main() -> None:
+    global _args
+    global _zen_model
+
+    def dir_path(path: str) -> str:
+        """Validate path is a directory for argparse."""
+        if os.path.isdir(path):
+            return path
+        raise argparse.ArgumentTypeError(
+            f'\'{path}\' is not a valid directory')
+
+    parser = argparse.ArgumentParser(description="AMD perf json generator")
+    parser.add_argument(
+        "-metricgroups", help="Generate metricgroups data", action='store_true')
+    parser.add_argument("model", help="e.g. amdzen[123]")
+    parser.add_argument(
+        'events_path',
+        type=dir_path,
+        help='Root of tree containing architecture directories containing json files'
+    )
+    _args = parser.parse_args()
+
+    directory = f"{_args.events_path}/x86/{_args.model}/"
+    LoadEvents(directory)
+
+    _zen_model = int(_args.model[6:])
+
+    all_metrics = MetricGroup("", [
+        AmdBr(),
+        AmdCtxSw(),
+        AmdDtlb(),
+        AmdItlb(),
+        AmdLdSt(),
+        AmdUpc(),
+        Cycles(),
+        Idle(),
+        Rapl(),
+        UncoreL3(),
+    ])
+
+    if _args.metricgroups:
+        print(JsonEncodeMetricGroupDescriptions(all_metrics))
+    else:
+        print(JsonEncodeMetric(all_metrics))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json
deleted file mode 100644
index d8b7b9f9e5fa..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json
+++ /dev/null
@@ -1,8 +0,0 @@
-[
-    {
-        "ArchStdEvent": "PMU_OVFS"
-    },
-    {
-        "ArchStdEvent": "PMU_HOVFS"
-    }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
index 2416d9f8a83d..468cb085d879 100644
--- a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
+++ b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
@@ -433,24 +433,12 @@
         "BriefDescription": "Trace buffer current write pointer wrapped"
     },
     {
-        "PublicDescription": "PMU overflow, counters accessible to EL1 and EL0",
-        "EventCode": "0x400D",
-        "EventName": "PMU_OVFS",
-        "BriefDescription": "PMU overflow, counters accessible to EL1 and EL0"
-    },
-    {
         "PublicDescription": "Trace buffer Trigger Event",
         "EventCode": "0x400E",
         "EventName": "TRB_TRIG",
         "BriefDescription": "Trace buffer Trigger Event"
     },
     {
-        "PublicDescription": "PMU overflow, counters reserved for use by EL2",
-        "EventCode": "0x400F",
-        "EventName": "PMU_HOVFS",
-        "BriefDescription": "PMU overflow, counters reserved for use by EL2"
-    },
-    {
         "PublicDescription": "PE Trace Unit external output 0",
         "EventCode": "0x4010",
         "EventName": "TRCEXTOUT0",
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json b/tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json
deleted file mode 100644
index 65bd6cdd0dd5..000000000000
--- a/tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json
+++ /dev/null
@@ -1,10 +0,0 @@
-[
-    {
-        "ArchStdEvent": "PMU_OVFS",
-        "BriefDescription": "This event counts the event generated each time one of the condition occurs described in Arm Architecture Reference Manual for A-profile architecture. This event is only for output to the trace unit."
-    },
-    {
-        "ArchStdEvent": "PMU_HOVFS",
-        "BriefDescription": "This event counts the event generated each time an event is counted by an event counter <n> and all of the condition occur described in Arm Architecture Reference Manual for A-profile architecture. This event is only for output to the trace unit."
-    }
-]
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json b/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json
index 7a5d1bf543f8..8d028a7c2777 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json
@@ -29,25 +29,25 @@
 	"MetricExpr" : "nest_mcs01_imc@PM_MCS01_128B_RD_DISP_PORT01@ + nest_mcs01_imc@PM_MCS01_128B_RD_DISP_PORT23@",
 	"MetricName" : "mcs01-read",
 	"MetricGroup" : "memory-bandwidth",
-	"ScaleUnit": "6.1e-5MB"
+	"ScaleUnit": "6.1e-5MiB"
     },
     {
 	"MetricExpr" : "nest_mcs23_imc@PM_MCS23_128B_RD_DISP_PORT01@ + nest_mcs23_imc@PM_MCS23_128B_RD_DISP_PORT23@",
 	"MetricName" : "mcs23-read",
 	"MetricGroup" : "memory-bandwidth",
-	"ScaleUnit": "6.1e-5MB"
+	"ScaleUnit": "6.1e-5MiB"
     },
     {
 	"MetricExpr" : "nest_mcs01_imc@PM_MCS01_128B_WR_DISP_PORT01@ + nest_mcs01_imc@PM_MCS01_128B_WR_DISP_PORT23@",
 	"MetricName" : "mcs01-write",
 	"MetricGroup" : "memory-bandwidth",
-	"ScaleUnit": "6.1e-5MB"
+	"ScaleUnit": "6.1e-5MiB"
     },
     {
 	"MetricExpr" : "nest_mcs23_imc@PM_MCS23_128B_WR_DISP_PORT01@ + nest_mcs23_imc@PM_MCS23_128B_WR_DISP_PORT23@",
 	"MetricName" : "mcs23-write",
 	"MetricGroup" : "memory-bandwidth",
-	"ScaleUnit": "6.1e-5MB"
+	"ScaleUnit": "6.1e-5MiB"
     },
     {
 	"MetricExpr" : "nest_powerbus0_imc@PM_PB_CYC@",
diff --git a/tools/perf/pmu-events/arch/riscv/mapfile.csv b/tools/perf/pmu-events/arch/riscv/mapfile.csv
index d5eea7f9aa9a..87cfb0e0849f 100644
--- a/tools/perf/pmu-events/arch/riscv/mapfile.csv
+++ b/tools/perf/pmu-events/arch/riscv/mapfile.csv
@@ -21,5 +21,6 @@
 0x489-0x8000000000000[1-6]08-0x[9b][[:xdigit:]]+,v1,sifive/p650,core
 0x5b7-0x0-0x0,v1,thead/c900-legacy,core
 0x5b7-0x80000000090c0d00-0x2047000,v1,thead/c900-legacy,core
+0x602-0x3-0x0,v1,openhwgroup/cva6,core
 0x67e-0x80000000db0000[89]0-0x[[:xdigit:]]+,v1,starfive/dubhe-80,core
 0x31e-0x8000000000008a45-0x[[:xdigit:]]+,v1,andes/ax45,core
diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json
new file mode 100644
index 000000000000..7149caec4f80
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json
@@ -0,0 +1,68 @@
+[
+  {
+    "ArchStdEvent": "FW_MISALIGNED_LOAD"
+  },
+  {
+    "ArchStdEvent": "FW_MISALIGNED_STORE"
+  },
+  {
+    "ArchStdEvent": "FW_ACCESS_LOAD"
+  },
+  {
+    "ArchStdEvent": "FW_ACCESS_STORE"
+  },
+  {
+    "ArchStdEvent": "FW_ILLEGAL_INSN"
+  },
+  {
+    "ArchStdEvent": "FW_SET_TIMER"
+  },
+  {
+    "ArchStdEvent": "FW_IPI_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_IPI_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_FENCE_I_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_FENCE_I_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_VMID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_VMID_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_ASID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_ASID_RECEIVED"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json
new file mode 100644
index 000000000000..c38f6c97cf1f
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json
@@ -0,0 +1,47 @@
+[
+  {
+    "EventName": "LOAD_INSTRUCTIONS_RETIRED",
+    "EventCode": "0x5",
+    "BriefDescription": "number of data memory load instructions retired"
+  },
+  {
+    "EventName": "STORE_INSTRUCTIONS_RETIRED",
+    "EventCode": "0x6",
+    "BriefDescription": "number of data memory store instructions retired"
+  },
+  {
+    "EventName": "EXCEPTIONS",
+    "EventCode": "0x7",
+    "BriefDescription": "valid exceptions encountered"
+  },
+  {
+    "EventName": "EXCEPTION_HANDLER_RETURNS",
+    "EventCode": "0x8",
+    "BriefDescription": "return from an exception"
+  },
+  {
+    "EventName": "BRANCH_INSTRUCTIONS_RETIRED",
+    "EventCode": "0x9",
+    "BriefDescription": "number of branch instructions encountered retired"
+  },
+  {
+    "EventName": "CALL_INSTRUCTIONS_RETIRED",
+    "EventCode": "0xC",
+    "BriefDescription": "number of call instructions retired"
+  },
+  {
+    "EventName": "RETURN_INSTRUCTIONS_RETIRED",
+    "EventCode": "0xD",
+    "BriefDescription": "number of return instructions retired"
+  },
+  {
+    "EventName": "INTEGER_INSTRUCTIONS_RETIRED",
+    "EventCode": "0x14",
+    "BriefDescription": "number of integer instructions retired"
+  },
+  {
+    "EventName": "FLOATING_POINT_INSTRUCTIONS_RETIRED",
+    "EventCode": "0x15",
+    "BriefDescription": "number of floating point instructions retired"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json
new file mode 100644
index 000000000000..c4f376a0ee4e
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json
@@ -0,0 +1,42 @@
+[
+  {
+    "EventName": "L1_I_CACHE_MISSES",
+    "EventCode": "0x1",
+    "BriefDescription": "number of misses in L1 I-Cache"
+  },
+  {
+    "EventName": "L1_D_CACHE_MISSES",
+    "EventCode": "0x2",
+    "BriefDescription": "number of misses in L1 D-Cache"
+  },
+  {
+    "EventName": "ITLB_MISSES",
+    "EventCode": "0x3",
+    "BriefDescription": "number of misses in ITLB"
+  },
+  {
+    "EventName": "DTLB_MISSES",
+    "EventCode": "0x4",
+    "BriefDescription": "number of misses in DTLB"
+  },
+  {
+    "EventName": "L1_I_CACHE_ACCESSES",
+    "EventCode": "0x10",
+    "BriefDescription": "number of accesses to instruction cache"
+  },
+  {
+    "EventName": "L1_D_CACHE_ACCESSES",
+    "EventCode": "0x11",
+    "BriefDescription": "number of accesses to data cache"
+  },
+  {
+    "EventName": "L1_CACHE_LINE_EVICTION",
+    "EventCode": "0x12",
+    "BriefDescription": "number of data cache line eviction"
+  },
+  {
+    "EventName": "ITLB_FLUSH",
+    "EventCode": "0x13",
+    "BriefDescription": "number of ITLB flushes"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json
new file mode 100644
index 000000000000..104e6e8197da
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json
@@ -0,0 +1,27 @@
+[
+  {
+    "EventName": "BRANCH_MISPREDICTS",
+    "EventCode": "0xA",
+    "BriefDescription": "number of branch mispredictions"
+  },
+  {
+    "EventName": "BRANCH_EXCEPTIONS",
+    "EventCode": "0xB",
+    "BriefDescription": "number of valid branch exceptions"
+  },
+  {
+    "EventName": "MSB_FULL",
+    "EventCode": "0xE",
+    "BriefDescription": "scoreboard is full"
+  },
+  {
+    "EventName": "INSTRUCTION_FETCH_EMPTY",
+    "EventCode": "0xF",
+    "BriefDescription": "number of invalid instructions in IF stage"
+  },
+  {
+    "EventName": "PIPELINE_STALL",
+    "EventCode": "0x16",
+    "BriefDescription": "number of cycles the pipeline is stalled during read operands"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json b/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json
index ff6627a77805..06bbaea15925 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json
@@ -70,19 +70,19 @@
     "EventName": "ls_mab_alloc.load_store_allocations",
     "EventCode": "0x41",
     "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for load-store allocations.",
-    "UMask": "0x3f"
+    "UMask": "0x07"
   },
   {
     "EventName": "ls_mab_alloc.hardware_prefetcher_allocations",
     "EventCode": "0x41",
     "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for hardware prefetcher allocations.",
-    "UMask": "0x40"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_mab_alloc.all_allocations",
     "EventCode": "0x41",
     "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for all types of allocations.",
-    "UMask": "0x7f"
+    "UMask": "0x0f"
   },
   {
     "EventName": "ls_dmnd_fills_from_sys.local_l2",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json b/tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json
new file mode 100644
index 000000000000..dd70069f68ed
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json
@@ -0,0 +1,93 @@
+[
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_hit",
+    "EventCode": "0x84",
+    "BriefDescription": "Instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB."
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k",
+    "EventCode": "0x85",
+    "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for 4k pages.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m",
+    "EventCode": "0x85",
+    "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for 2M pages.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g",
+    "EventCode": "0x85",
+    "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for 1G pages.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k",
+    "EventCode": "0x85",
+    "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for coalesced pages (16k pages created from four adjacent 4k pages).",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.all",
+    "EventCode": "0x85",
+    "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for all page sizes.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "bp_pipe_correct",
+    "EventCode": "0x8b",
+    "BriefDescription": "Branch predictor pipeline flushes due to internal conditions such as a second level prediction structure."
+  },
+  {
+    "EventName": "bp_var_target_pred",
+    "EventCode": "0x8e",
+    "BriefDescription": "Indirect predictions (branch used the indirect predictor to make a prediction)."
+  },
+  {
+    "EventName": "bp_early_redir",
+    "EventCode": "0x91",
+    "BriefDescription": "Early redirects sent to branch predictor. This happens when either the decoder or dispatch logic is able to detect that the branch predictor needs to be redirected."
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if4k",
+    "EventCode": "0x94",
+    "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 4k or coalesced pages (16k pages created from four adjacent 4k pages).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if2m",
+    "EventCode": "0x94",
+    "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 2M pages.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if1g",
+    "EventCode": "0x94",
+    "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 1G pages.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.all",
+    "EventCode": "0x94",
+    "BriefDescription": "Instruction fetches that hit in the L1 ITLB for all page sizes.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "bp_fe_redir.resync",
+    "EventCode": "0x9f",
+    "BriefDescription": "Redirects of the pipeline frontend caused by resyncs. These are retire time pipeline restarts.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_fe_redir.ex_redir",
+    "EventCode": "0x9f",
+    "BriefDescription": "Redirects of the pipeline frontend caused by mispredicts. These are used for branch direction correction and handling indirect branch target mispredicts.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_fe_redir.all",
+    "EventCode": "0x9f",
+    "BriefDescription": "Redirects of the pipeline frontend caused by any reason."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/decode.json b/tools/perf/pmu-events/arch/x86/amdzen6/decode.json
new file mode 100644
index 000000000000..c5d37fbac948
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/decode.json
@@ -0,0 +1,139 @@
+[
+  {
+    "EventName": "de_op_queue_empty",
+    "EventCode": "0xa9",
+    "BriefDescription": "Cycles where the op queue is empty. Such cycles indicate that the frontend is not delivering instructions fast enough."
+  },
+  {
+    "EventName": "de_src_op_disp.x86_decoder",
+    "EventCode": "0xaa",
+    "BriefDescription": "Ops dispatched from x86 decoder.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_src_op_disp.op_cache",
+    "EventCode": "0xaa",
+    "BriefDescription": "Ops dispatched from op cache.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_src_op_disp.all",
+    "EventCode": "0xaa",
+    "BriefDescription": "Ops dispatched from any source.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "de_dis_ops_from_decoder.any_fp",
+    "EventCode": "0xab",
+    "BriefDescription": "Ops dispatched from the decoder to a floating-point unit.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_ops_from_decoder.any_int",
+    "EventCode": "0xab",
+    "BriefDescription": "Ops dispatched from the decoder to an integer unit.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_disp_stall_cycles_dynamic_tokens_part1.int_phy_reg_file_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to integer physical register file resource stalls.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.load_queue_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to load queue token stalls.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.store_queue_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to store queue token stalls.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.taken_brnch_buffer_rsrc",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to taken branch buffer resource stalls.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.fp_sch_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to floating-point non-schedulable queue token stalls.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq0",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 0 tokens.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq1",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 1 tokens.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq2",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 2 tokens.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq3",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 3 tokens.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq4",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 4 tokens.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq5",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 5 tokens.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.ret_q",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of retire queue tokens.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.all",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to any token stalls.",
+    "UMask": "0xbf"
+  },
+  {
+    "EventName": "de_no_dispatch_per_slot.no_ops_from_frontend",
+    "EventCode": "0x1a0",
+    "BriefDescription": "Dispatch slots in each cycle that were empty because the frontend did not supply ops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_no_dispatch_per_slot.backend_stalls",
+    "EventCode": "0x1a0",
+    "BriefDescription": "Dispatch slots in each cycle that were unused because of backend stalls.",
+    "UMask": "0x1e"
+  },
+  {
+    "EventName": "de_no_dispatch_per_slot.smt_contention",
+    "EventCode": "0x1a0",
+    "BriefDescription": "Dispatch slots in each cycle that were unused because the dispatch cycle was granted to the other SMT thread.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "de_additional_resource_stalls.dispatch_stalls",
+    "EventCode": "0x1a2",
+    "BriefDescription": "Counts additional cycles where dispatch is stalled due to a lack of dispatch resources.",
+    "UMask": "0x30"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/execution.json b/tools/perf/pmu-events/arch/x86/amdzen6/execution.json
new file mode 100644
index 000000000000..1b80acc89b6f
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/execution.json
@@ -0,0 +1,192 @@
+[
+  {
+    "EventName": "ex_ret_instr",
+    "EventCode": "0xc0",
+    "BriefDescription": "Retired instructions."
+  },
+  {
+    "EventName": "ex_ret_ops",
+    "EventCode": "0xc1",
+    "BriefDescription": "Retired macro-ops."
+  },
+  {
+    "EventName": "ex_ret_brn",
+    "EventCode": "0xc2",
+    "BriefDescription": "Retired branch instructions (all types of architectural control flow changes, including exceptions and interrupts)."
+  },
+  {
+    "EventName": "ex_ret_brn_misp",
+    "EventCode": "0xc3",
+    "BriefDescription": "Retired branch instructions that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_tkn",
+    "EventCode": "0xc4",
+    "BriefDescription": "Retired taken branch instructions (all types of architectural control flow changes, including exceptions and interrupts)."
+  },
+  {
+    "EventName": "ex_ret_brn_tkn_misp",
+    "EventCode": "0xc5",
+    "BriefDescription": "Retired taken branch instructions that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_far",
+    "EventCode": "0xc6",
+    "BriefDescription": "Retired far control transfers (far call, far jump, far return, IRET, SYSCALL and SYSRET, plus exceptions and interrupts). Far control transfers are not subject to branch prediction."
+  },
+  {
+    "EventName": "ex_ret_near_ret",
+    "EventCode": "0xc8",
+    "BriefDescription": "Retired near returns (RET or RET Iw)."
+  },
+  {
+    "EventName": "ex_ret_near_ret_mispred",
+    "EventCode": "0xc9",
+    "BriefDescription": "Retired near returns that were mispredicted. Each misprediction incurs the same penalty as that of a mispredicted conditional branch instruction."
+  },
+  {
+    "EventName": "ex_ret_brn_ind_misp",
+    "EventCode": "0xca",
+    "BriefDescription": "Retired indirect branch instructions that were mispredicted (only EX mispredicts). Each misprediction incurs the same penalty as that of a mispredicted conditional branch instruction."
+  },
+  {
+    "EventName": "ex_ret_brn_ind",
+    "EventCode": "0xcc",
+    "BriefDescription": "Retired indirect branch instructions."
+  },
+  {
+    "EventName": "ex_ret_brn_cond",
+    "EventCode": "0xd1",
+    "BriefDescription": "Retired conditional branch instructions."
+  },
+  {
+    "EventName": "ex_div_busy",
+    "EventCode": "0xd3",
+    "BriefDescription": "Cycles where the divider is busy."
+  },
+  {
+    "EventName": "ex_div_count",
+    "EventCode": "0xd4",
+    "BriefDescription": "Divide ops executed."
+  },
+  {
+    "EventName": "ex_no_retire.empty",
+    "EventCode": "0xd6",
+    "BriefDescription": "Cycles where the thread does not retire any ops due to a lack of valid ops in the retire queue (may be caused by front-end bottlenecks or pipeline redirects).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_no_retire.not_complete",
+    "EventCode": "0xd6",
+    "BriefDescription": "Cycles where the thread does not retire any ops as the oldest retire slot is waiting to be marked as completed.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_no_retire.other",
+    "EventCode": "0xd6",
+    "BriefDescription": "Cycles where the thread does not retire any ops due to other reasons (retire breaks, traps, faults, etc.).",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ex_no_retire.thread_not_selected",
+    "EventCode": "0xd6",
+    "BriefDescription": "Cycles where the thread does not retire any ops as thread arbitration did not select the current thread.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ex_no_retire.load_not_complete",
+    "EventCode": "0xd6",
+    "BriefDescription": "Cycles where the thread does not retire any ops due to missing load completion.",
+    "UMask": "0xa2"
+  },
+  {
+    "EventName": "ex_ret_ucode_instr",
+    "EventCode": "0x1c1",
+    "BriefDescription": "Retired microcoded instructions."
+  },
+  {
+    "EventName": "ex_ret_ucode_ops",
+    "EventCode": "0x1c2",
+    "BriefDescription": "Retired microcode ops."
+  },
+  {
+    "EventName": "ex_ret_brn_cond_misp",
+    "EventCode": "0x1c7",
+    "BriefDescription": "Retired conditional branch instructions that were mispredicted due to direction mismatch."
+  },
+  {
+    "EventName": "ex_ret_brn_uncond_ind_near_misp",
+    "EventCode": "0x1c8",
+    "BriefDescription": "Retired unconditional indirect near branch instructions that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_uncond",
+    "EventCode": "0x1c9",
+    "BriefDescription": "Retired unconditional branch instructions."
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.tagged",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Execution IBS tagged ops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.tagged_ret",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Execution IBS tagged ops that retired.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.rollovers",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Execution IBS periodic counter rollovers due to a previous tagged op not being IBS complete.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.filtered",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Execution IBS tagged ops that retired but were discarded due to IBS filtering.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.valid",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Execution IBS tagged ops that resulted in a valid sample and an IBS interrupt.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ex_ret_fused_instr",
+    "EventCode": "0x1d0",
+    "BriefDescription": "Retired fused instructions."
+  },
+  {
+    "EventName": "ex_mprof_ibs_ops.tagged",
+    "EventCode": "0x2c0",
+    "BriefDescription": "Memory Profiler IBS tagged ops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_mprof_ibs_ops.tagged_ret",
+    "EventCode": "0x2c0",
+    "BriefDescription": "Memory Profiler IBS tagged ops that retired.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_mprof_ibs_ops.rollovers",
+    "EventCode": "0x2c0",
+    "BriefDescription": "Memory Profiler IBS periodic counter rollovers due to a previous tagged op not being IBS complete.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ex_mprof_ibs_ops.filtered",
+    "EventCode": "0x2c0",
+    "BriefDescription": "Memory Profiler IBS tagged ops that retired but were discarded due to IBS filtering.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ex_mprof_ibs_ops.valid",
+    "EventCode": "0x2c0",
+    "BriefDescription": "Memory Profiler IBS tagged ops that resulted in a valid sample and an IBS interrupt.",
+    "UMask": "0x10"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json
new file mode 100644
index 000000000000..03cb039434de
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json
@@ -0,0 +1,1106 @@
+[
+  {
+    "EventName": "fp_ret_x87_fp_ops.add_sub_ops",
+    "EventCode": "0x02",
+    "BriefDescription": "Retired x87 floating-point add and subtract uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ret_x87_fp_ops.mul_ops",
+    "EventCode": "0x02",
+    "BriefDescription": "Retired x87 floating-point multiply uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ret_x87_fp_ops.div_sqrt_ops",
+    "EventCode": "0x02",
+    "BriefDescription": "Retired x87 floating-point divide and square root uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ret_x87_fp_ops.all",
+    "EventCode": "0x02",
+    "BriefDescription": "Retired x87 floating-point uops of all types.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.add_sub_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX add and subtract FLOPs.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.mult_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX multiply FLOPs.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.div_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX divide and square root FLOPs.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.mac_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX multiply-accumulate FLOPs (each operation is counted as 2 FLOPs, bfloat operations are not included).",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.bfloat16_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX bfloat16 FLOPs.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.scalar_single_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX scalar single-precision (FP32) FLOPs.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.packed_single_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX packed single-precision (FP32) FLOPs.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.scalar_double_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX scalar double-precision (FP64) FLOPs.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.packed_double_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX packed double-precision (FP64) FLOPs.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.scalar_half_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX scalar half-precision (FP16) FLOPs.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.packed_half_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX packed half-precision (FP16) FLOPs.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.all",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX FLOPs of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.x87",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired x87 floating-point uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.mmx",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired MMX floating-point uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.scalar",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired scalar floating-point uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.pack_128",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired packed 128-bit floating-point uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.pack_256",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired packed 256-bit floating-point uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.pack_512",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired packed 512-bit floating-point uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.all",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired floating-point uops of all widths.",
+    "UMask": "0x3f"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_add",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point add uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_sub",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point subtract uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_mul",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point multiply uops.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_mac",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point multiply-accumulate uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_div",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point divide uops.",
+    "UMask": "0x05"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_sqrt",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point square root uops.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_cmp",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point compare uops.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_cvt",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point convert uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_blend",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point blend uops.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_move",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point move uops.",
+    "UMask": "0x0a"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_shuffle",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0x0b"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_bfloat",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point bfloat uops.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_logical",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point move uops.",
+    "UMask": "0x0d"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_other",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point uops of other types.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_all",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point uops of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_add",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point add uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_sub",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point subtract uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_mul",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point multiply uops.",
+    "UMask": "0x30"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_mac",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point multiply-accumulate uops.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_div",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point divide uops.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_sqrt",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point square root uops.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_cmp",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point compare uops.",
+    "UMask": "0x70"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_cvt",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point convert uops.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_blend",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point blend uops.",
+    "UMask": "0x90"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_move",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point move uops.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_shuffle",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0xb0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_bfloat",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point bfloat uops.",
+    "UMask": "0xc0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_logical",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point logical uops.",
+    "UMask": "0xd0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_other",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point uops of other types.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_all",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point uops of all types.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.all",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired floating-point uops of all types.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_add",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer add uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_sub",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer subtract uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_mul",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer multiply uops.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_mac",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer multiply-accumulate uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_aes",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer AES uops.",
+    "UMask": "0x05"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_sha",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer SHA uops.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_cmp",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer compare uops.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_cvt",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer convert or pack uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_shift",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer shift or rotate uops.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_mov",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer move uops.",
+    "UMask": "0x0a"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_shuffle",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0x0b"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_vnni",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer VNNI uops.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_logical",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer logical uops.",
+    "UMask": "0x0d"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_other",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer multiply uops of other types.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_all",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer uops of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_add",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer add uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_sub",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer subtract uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_mul",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer multiply uops.",
+    "UMask": "0x30"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_mac",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer multiply-accumulate uops.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_aes",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer AES uops.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_sha",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer SHA uops.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_cmp",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer compare uops.",
+    "UMask": "0x70"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_cvt",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer convert or pack uops.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_shift",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer shift or rotate uops.",
+    "UMask": "0x90"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_mov",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer move uops.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_shuffle",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0xb0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_vnni",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer VNNI uops.",
+    "UMask": "0xc0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_logical",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer logical uops.",
+    "UMask": "0xd0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_other",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer uops of other types.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_all",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer uops of all types.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.all",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX, SSE and AVX integer uops of all types.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_add",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point add uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_sub",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point subtract uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_mul",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point multiply uops.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_mac",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point multiply-accumulate uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_div",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point divide uops.",
+    "UMask": "0x05"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_sqrt",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point square root uops.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_cmp",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point compare uops.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_cvt",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point convert uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_blend",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point blend uops.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_mov",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point move uops.",
+    "UMask": "0x0a"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_shuffle",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0x0b"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_bfloat",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point bfloat uops.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_logical",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point logical uops.",
+    "UMask": "0x0d"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_other",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point uops of other types.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_all",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point uops of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_add",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point add uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_sub",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point subtract uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_mul",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point multiply uops.",
+    "UMask": "0x30"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_mac",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point multiply-accumulate uops.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_div",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point divide uops.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_sqrt",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point square root uops.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_cmp",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point compare uops.",
+    "UMask": "0x70"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_cvt",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point convert uops.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_blend",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point blend uops.",
+    "UMask": "0x90"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_mov",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point move uops.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_shuffle",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0xb0"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_logical",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point logical uops.",
+    "UMask": "0xd0"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_other",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point uops of other types.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_all",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point uops of all types.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp_all",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired packed floating-point uops of all types.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_add",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer add uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_sub",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer subtract uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_mul",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer multiply uops.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_mac",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer multiply-accumulate uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_aes",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer AES uops.",
+    "UMask": "0x05"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_sha",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer SHA uops.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_cmp",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer compare uops.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_cvt",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer convert or pack uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_shift",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer shift or rotate uops.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_mov",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer move uops.",
+    "UMask": "0x0a"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_shuffle",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0x0b"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_vnni",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer VNNI ops.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_logical",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer logical uops.",
+    "UMask": "0x0d"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_other",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer uops of other types.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_all",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer uops of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_add",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer add uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_sub",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer subtract uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_mul",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer multiply uops.",
+    "UMask": "0x30"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_mac",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer multiply-accumulate uops.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_cmp",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer compare uops.",
+    "UMask": "0x70"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_shift",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer shift or rotate uops.",
+    "UMask": "0x90"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_mov",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer move uops.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_shuffle",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0xb0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_vnni",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer VNNI uops.",
+    "UMask": "0xc0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_logical",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer logical uops.",
+    "UMask": "0xd0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_other",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer uops of other types.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_all",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer uops of all types.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int_all",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired packed integer uops of all types.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_disp_faults.x87_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults for x87 fills.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_disp_faults.xmm_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults for XMM fills.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_disp_faults.ymm_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults for YMM fills.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_disp_faults.ymm_spill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults for YMM spills.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_disp_faults.sse_avx_all",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults of all types for SSE and AVX ops.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_disp_faults.all",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_add",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point add uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_sub",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point subtract uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_mul",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point multiply uops.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_mac",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point multiply-accumulate uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_div",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point divide uops.",
+    "UMask": "0x05"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_sqrt",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point square root uops.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_cmp",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point compare uops.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_cvt",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point convert uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_blend",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point blend uops.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_mov",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point move uops.",
+    "UMask": "0x0a"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_shuffle",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0x0b"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_bfloat",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point bfloat uops.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_logical",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point logical uops.",
+    "UMask": "0x0d"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_other",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point uops of other types.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_all",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point uops of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_add",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer add uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_sub",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer subtract uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_mul",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer multiply uops.",
+    "UMask": "0x30"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_mac",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer multiply-accumulate uops.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_aes",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer AES uops.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_sha",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer SHA uops.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_cmp",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer compare uops.",
+    "UMask": "0x70"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_cvt",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer convert or pack uops.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_shift",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer shift or rotate uops.",
+    "UMask": "0x90"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_mov",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer move uops.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_shuffle",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0xb0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_vnni",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer VNNI uops.",
+    "UMask": "0xc0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_logical",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer logical uops.",
+    "UMask": "0xd0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_other",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer uops of other types.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_all",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer uops of all types.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.512b_all",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed uops of all types.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_nsq_read_stalls.fp_prf",
+    "EventCode": "0x13",
+    "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to insufficient free physical register file (FP-PRF) entries.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_nsq_read_stalls.k_prf",
+    "EventCode": "0x13",
+    "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to insufficient free mask physical register file (K-PRF) entries.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_nsq_read_stalls.fp_sq",
+    "EventCode": "0x13",
+    "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to insufficient free scheduler entries.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_nsq_read_stalls.all",
+    "EventCode": "0x13",
+    "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to any reason.",
+    "UMask": "0x0e"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json b/tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json
new file mode 100644
index 000000000000..5ab6766f8940
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json
@@ -0,0 +1,120 @@
+[
+  {
+    "EventName": "ic_cache_fill_l2",
+    "EventCode": "0x82",
+    "BriefDescription": "Instruction cache lines (64 bytes) fulfilled from the L2 cache."
+  },
+  {
+    "EventName": "ic_cache_fill_sys",
+    "EventCode": "0x83",
+    "BriefDescription": "Instruction cache lines (64 bytes) fulfilled from system memory or another cache."
+  },
+  {
+    "EventName": "ic_fetch_ibs_events.tagged",
+    "EventCode": "0x188",
+    "BriefDescription": "Fetch IBS tagged fetches. Not all tagged fetches result in a valid sample and an IBS interrupt.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_fetch_ibs_events.filtered",
+    "EventCode": "0x188",
+    "BriefDescription": "Fetch IBS tagged fetches that were discarded due to IBS filtering.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ic_fetch_ibs_events.valid",
+    "EventCode": "0x188",
+    "BriefDescription": "Fetch IBS tagged fetches that resulted in a valid sample and an IBS interrupt.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "op_cache_hit_miss.hit",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op cache fetch hits.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "op_cache_hit_miss.miss",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op cache fetch misses.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "op_cache_hit_miss.all",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op cache fetches of all types.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "ic_fills_from_sys.local_l2",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from local L2 cache.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ic_fills_from_sys.local_ccx",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_fills_from_sys.local_all",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ic_fills_from_sys.near_cache",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ic_fills_from_sys.dram_io_near",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ic_fills_from_sys.far_cache",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ic_fills_from_sys.remote_cache",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from cache of another CCX in the same or a different NUMA node.",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "ic_fills_from_sys.dram_io_far",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ic_fills_from_sys.dram_io_all",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "ic_fills_from_sys.far_all",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "ic_fills_from_sys.alt_mem",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ic_fills_from_sys.all",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from all types of sources.",
+    "UMask": "0xdf"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json b/tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json
new file mode 100644
index 000000000000..b0b2090fb920
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json
@@ -0,0 +1,326 @@
+[
+  {
+    "EventName": "l2_request_g1.group2",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests of non-cacheable type (non-cached data and instructions reads, self-modifying code checks).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_request_g1.l2_hwpf",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests from hardware prefetchers to prefetch directly into L2 (hit or miss).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_request_g1.prefetch_l2_cmd",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests to prefetch directly into L2.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_request_g1.cacheable_ic_read",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests for instruction cache reads.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_request_g1.ls_rd_blk_c_s",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests for data cache shared reads.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_request_g1.rd_blk_x",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests for data cache stores.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_request_g1.rd_blk_l",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests for data cache reads (includes hardware and software prefetches).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_request_g1.dc_all",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests of common types from data cache (includes prefetches).",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "l2_request_g1.no_pf_all",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests of common types not including prefetches.",
+    "UMask": "0xf1"
+  },
+  {
+    "EventName": "l2_request_g1.all",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests of all types.",
+    "UMask": "0xf7"
+  },
+  {
+    "EventName": "l2_request_g2.ls_rd_sized_nc",
+    "EventCode": "0x61",
+    "BriefDescription": "L2 cache requests for non-coherent, non-cacheable LS sized reads.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_request_g2.ls_rd_sized",
+    "EventCode": "0x61",
+    "BriefDescription": "L2 cache requests for coherent, non-cacheable LS sized reads.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_request_g2.all",
+    "EventCode": "0x61",
+    "BriefDescription": "L2 cache requests of all rare types.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_wcb_req.wcb_close",
+    "EventCode": "0x63",
+    "BriefDescription": "Write Combining Buffer (WCB) closures.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_miss",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 misses.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_hit_s",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 hits on non-modifiable lines.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_hit_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 hits on modifiable lines.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_hit_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 hits.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_access_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 accesses.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_c",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 misses.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache and the instruction cache that result in L2 misses.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) that result in data cache stores or L2 state change hits.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_s",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 hits on non-modifiable lines.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 hits on modifiable lines.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 read hits on shared lines.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_cache_req_stat.dc_hit_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 hits.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache and the instruction cache that result in L2 hits.",
+    "UMask": "0xf6"
+  },
+  {
+    "EventName": "l2_cache_req_stat.dc_access_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 accesses.",
+    "UMask": "0xf8"
+  },
+  {
+    "EventName": "l2_cache_req_stat.all",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache and the instruction cache that result in L2 accesses.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_hit_l2.l2_hwpf",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L2 hardware prefetchers.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "l2_pf_hit_l2.l1_dc_hwpf",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L1 data hardware prefetchers.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "l2_pf_hit_l2.l1_dc_l2_hwpf",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L1 data and L2 hardware prefetchers.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_hit_l3.l2_hwpf",
+    "EventCode": "0x71",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L2 hardware prefetchers.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_hit_l3.l1_dc_hwpf",
+    "EventCode": "0x71",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L1 data hardware prefetchers.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf",
+    "EventCode": "0x71",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L1 data and L2 hardware prefetchers.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_l3.l2_hwpf",
+    "EventCode": "0x72",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L2 hardware prefetchers.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_l3.l1_dc_hwpf",
+    "EventCode": "0x72",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L1 data hardware prefetchers.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_l3.l1_dc_l2_hwpf",
+    "EventCode": "0x72",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L1 data and L2 hardware prefetchers.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.local_ccx",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.near_cache",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.dram_io_near",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.far_cache",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.dram_io_far",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.dram_io_all",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.far_all",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.alt_mem",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.all",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from all types of sources.",
+    "UMask": "0xde"
+  },
+  {
+    "EventName": "l2_sys_bw.local_dram_fill",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for fill events that target the same NUMA node and return from DRAM in the same NUMA node.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_sys_bw.remote_dram_fill",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for fill events that target a different NUMA node and return from DRAM in a different NUMA node.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_sys_bw.nt_write",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for non-temporal write events that target all NUMA nodes.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_sys_bw.local_scm_fill",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for fill events that target the same NUMA node and return from extension memory (CXL) in the same NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_sys_bw.remote_scm_fill",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for fill events that target a different NUMA node and return from extension memory (CXL) in a different NUMA node.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_sys_bw.victim",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for cache victim events that target all NUMA nodes.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_sys_bw.all",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for all types of events (total utilization).",
+    "UMask": "0xff"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json b/tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json
new file mode 100644
index 000000000000..9b9804317da7
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json
@@ -0,0 +1,177 @@
+[
+  {
+    "EventName": "l3_lookup_state.l3_miss",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 cache misses.",
+    "UMask": "0x01",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_lookup_state.l3_hit",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 cache hits.",
+    "UMask": "0xfe",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_lookup_state.all_coherent_accesses_to_l3",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 cache requests for all coherent accesses.",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.dram_near",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from DRAM in the same NUMA node.",
+    "UMask": "0x01",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.dram_far",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from DRAM in a different NUMA node.",
+    "UMask": "0x02",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.near_cache",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.far_cache",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x08",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.ext_near",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from extension memory (CXL) in the same NUMA node.",
+    "UMask": "0x10",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.ext_far",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from extension memory (CXL) in a different NUMA node.",
+    "UMask": "0x20",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.all",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from all types of sources.",
+    "UMask": "0x3f",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.dram_near",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from DRAM in the same NUMA node.",
+    "UMask": "0x01",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.dram_far",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from DRAM in a different NUMA node.",
+    "UMask": "0x02",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.near_cache",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.far_cache",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x08",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.ext_near",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from extension memory (CXL) in the same NUMA node.",
+    "UMask": "0x10",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.ext_far",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from extension memory (CXL) in a different NUMA node.",
+    "UMask": "0x20",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.all",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from all types of sources.",
+    "UMask": "0x3f",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/load-store.json b/tools/perf/pmu-events/arch/x86/amdzen6/load-store.json
new file mode 100644
index 000000000000..4291eb59426f
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/load-store.json
@@ -0,0 +1,523 @@
+[
+  {
+    "EventName": "ls_bad_status2.stli_other",
+    "EventCode": "0x24",
+    "BriefDescription": "Store-to-load conflicts (loads unable to complete due to a non-forwardable conflict with an older store).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_locks.bus_lock",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions which caused a bus lock (non-cacheable or cache-misaligned lock).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_locks.all",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions of all types.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "ls_ret_cl_flush",
+    "EventCode": "0x26",
+    "BriefDescription": "Retired CLFLUSH instructions."
+  },
+  {
+    "EventName": "ls_ret_cpuid",
+    "EventCode": "0x27",
+    "BriefDescription": "Retired CPUID instructions."
+  },
+  {
+    "EventName": "ls_dispatch.pure_ld",
+    "EventCode": "0x29",
+    "BriefDescription": "Memory load operations dispatched to the load-store unit.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_dispatch.pure_st",
+    "EventCode": "0x29",
+    "BriefDescription": "Memory store operations dispatched to the load-store unit.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_dispatch.ld_st",
+    "EventCode": "0x29",
+    "BriefDescription": "Memory load-store operations (load from and store to the same memory address) dispatched to the load-store unit.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_dispatch.all",
+    "EventCode": "0x29",
+    "BriefDescription": "Memory operations dispatched to the load-store unit of all types.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "ls_smi_rx",
+    "EventCode": "0x2b",
+    "BriefDescription": "System Management Interrupts (SMIs) received."
+  },
+  {
+    "EventName": "ls_int_taken",
+    "EventCode": "0x2c",
+    "BriefDescription": "Interrupts taken."
+  },
+  {
+    "EventName": "ls_stlf",
+    "EventCode": "0x35",
+    "BriefDescription": "Store-to-load-forward (STLF) hits."
+  },
+  {
+    "EventName": "ls_st_commit_cancel.older_st_vis_dep",
+    "EventCode": "0x37",
+    "BriefDescription": "Store commits cancelled due to an older store, that the thread was waiting on to become globally visible, was unable to become globally visible.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_mab_alloc.ls",
+    "EventCode": "0x41",
+    "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for load-store allocations.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "ls_mab_alloc.hwpf",
+    "EventCode": "0x41",
+    "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for hardware prefetcher allocations.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_mab_alloc.all",
+    "EventCode": "0x41",
+    "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for all types of allocations.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.local_l2",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from local L2 cache.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.local_ccx",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.local_all",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.near_cache",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.dram_io_near",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.far_cache",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.remote_cache",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.dram_io_far",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.dram_io_all",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.far_all",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.alt_mem",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.all",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from all types of sources.",
+    "UMask": "0xdf"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.local_l2",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from local L2 cache.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.local_ccx",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.local_all",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.near_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.dram_io_near",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.far_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.remote_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.dram_io_far",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.dram_io_all",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.far_all",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from either cache of another CCX, DRAM or MMIO when the address was in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.alt_mem",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.all",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from all types of data sources.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 4k pages.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB hits for coalesced pages (16k pages created from four adjacent 4k pages).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 2M pages.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 1G pages.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for 4k pages.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for coalesced pages (16k pages created from four adjacent 4k pages).",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for 2M pages.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for 1G pages.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.l2_miss_all",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for all page sizes.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.all",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses for all page sizes.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_misal_loads.ma64",
+    "EventCode": "0x47",
+    "BriefDescription": "64B misaligned (cacheline crossing) loads.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_misal_loads.ma4k",
+    "EventCode": "0x47",
+    "BriefDescription": "4kB misaligned (page crossing) loads.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchT0 (move data to all cache levels), T1 (move data to all cache levels except L1) and T2 (move data to all cache levels except L1 and L2).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch_w",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchW (move data to L1 cache and mark it modifiable).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch_nta",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchNTA (move data with minimum cache pollution i.e. non-temporal access).",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.all",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software prefetch instructions dispatched (speculative) of all types.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "wcb_close.full_line_64b",
+    "EventCode": "0x50",
+    "BriefDescription": "Events that caused a Write Combining Buffer (WCB) entry to close because all 64 bytes of the entry have been written to.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.dc_hit",
+    "EventCode": "0x52",
+    "BriefDescription": "Software prefetches that did not fetch data outside of the processor core as the PREFETCH instruction saw a data cache hit.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.mab_hit",
+    "EventCode": "0x52",
+    "BriefDescription": "Software prefetches that did not fetch data outside of the processor core as the PREFETCH instruction saw a match on an already allocated miss request (MAB).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.all",
+    "EventCode": "0x52",
+    "BriefDescript6ion": "Software prefetches that did not fetch data outside of the processor core for any reason.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.local_l2",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from local L2 cache.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.local_ccx",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.local_all",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.near_cache",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.dram_io_near",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.far_cache",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.remote_cache",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.dram_io_far",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.dram_io_all",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.far_all",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.alt_mem",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.all",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from all types of data sources.",
+    "UMask": "0xdf"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.local_l2",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from local L2 cache.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.local_ccx",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.local_all",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.near_cache",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.dram_io_near",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.far_cache",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.remote_cache",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.dram_io_far",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.dram_io_all",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.far_all",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.alt_mem",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.all",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from all types of data sources.",
+    "UMask": "0xdf"
+  },
+  {
+    "EventName": "ls_alloc_mab_count",
+    "EventCode": "0x5f",
+    "BriefDescription": "In-flight L1 data cache misses i.e. Miss Address Buffer (MAB) allocations each cycle."
+  },
+  {
+    "EventName": "ls_not_halted_cyc",
+    "EventCode": "0x76",
+    "BriefDescription": "Core cycles where the thread is not in halted state."
+  },
+  {
+    "EventName": "ls_tlb_flush.all",
+    "EventCode": "0x78",
+    "BriefDescription": "All TLB flushes.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_not_halted_p0_cyc.p0_freq_cyc",
+    "EventCode": "0x120",
+    "BriefDescription": "Reference cycles (P0 frequency) where the thread is not in halted state.",
+    "UMask": "0x1"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json b/tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json
new file mode 100644
index 000000000000..649a60b09e1b
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json
@@ -0,0 +1,101 @@
+[
+  {
+    "EventName": "umc_mem_clk",
+    "PublicDescription": "Memory clock (MEMCLK) cycles.",
+    "EventCode": "0x00",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_act_cmd.all",
+    "PublicDescription": "ACTIVATE commands sent.",
+    "EventCode": "0x05",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_act_cmd.rd",
+    "PublicDescription": "ACTIVATE commands sent for reads.",
+    "EventCode": "0x05",
+    "RdWrMask": "0x1",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_act_cmd.wr",
+    "PublicDescription": "ACTIVATE commands sent for writes.",
+    "EventCode": "0x05",
+    "RdWrMask": "0x2",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_pchg_cmd.all",
+    "PublicDescription": "PRECHARGE commands sent.",
+    "EventCode": "0x06",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_pchg_cmd.rd",
+    "PublicDescription": "PRECHARGE commands sent for reads.",
+    "EventCode": "0x06",
+    "RdWrMask": "0x1",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_pchg_cmd.wr",
+    "PublicDescription": "PRECHARGE commands sent for writes.",
+    "EventCode": "0x06",
+    "RdWrMask": "0x2",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_cas_cmd.all",
+    "PublicDescription": "CAS commands sent.",
+    "EventCode": "0x0a",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_cas_cmd.rd",
+    "PublicDescription": "CAS commands sent for reads.",
+    "EventCode": "0x0a",
+    "RdWrMask": "0x1",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_cas_cmd.wr",
+    "PublicDescription": "CAS commands sent for writes.",
+    "EventCode": "0x0a",
+    "RdWrMask": "0x2",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_data_slot_clks.all",
+    "PublicDescription": "Clock cycles where the data bus is utilized.",
+    "EventCode": "0x14",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_data_slot_clks.rd",
+    "PublicDescription": "Clock cycles where the data bus is utilized for reads.",
+    "EventCode": "0x14",
+    "RdWrMask": "0x1",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_data_slot_clks.wr",
+    "PublicDescription": "Clock cycles where the data bus is utilized for writes.",
+    "EventCode": "0x14",
+    "RdWrMask": "0x2",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json b/tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json
new file mode 100644
index 000000000000..48c501d8a097
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json
@@ -0,0 +1,99 @@
+[
+  {
+    "MetricName": "total_dispatch_slots",
+    "BriefDescription": "Total dispatch slots (up to 8 instructions can be dispatched in each cycle).",
+    "MetricExpr": "8 * ls_not_halted_cyc",
+    "ScaleUnit": "1slots"
+  },
+  {
+    "MetricName": "frontend_bound",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because the frontend did not supply enough instructions/ops.",
+    "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend, total_dispatch_slots)",
+    "MetricGroup": "PipelineL1",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "bad_speculation",
+    "BriefDescription": "Percentage of dispatched ops that did not retire.",
+    "MetricExpr": "d_ratio(de_src_op_disp.all - ex_ret_ops, total_dispatch_slots)",
+    "MetricGroup": "PipelineL1",
+    "ScaleUnit": "100%ops"
+  },
+  {
+    "MetricName": "backend_bound",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because of backend stalls.",
+    "MetricExpr": "d_ratio(de_no_dispatch_per_slot.backend_stalls, total_dispatch_slots)",
+    "MetricGroup": "PipelineL1",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "smt_contention",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because the other thread was selected.",
+    "MetricExpr": "d_ratio(de_no_dispatch_per_slot.smt_contention, total_dispatch_slots)",
+    "MetricGroup": "PipelineL1",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "retiring",
+    "BriefDescription": "Percentage of dispatch slots used by ops that retired.",
+    "MetricExpr": "d_ratio(ex_ret_ops, total_dispatch_slots)",
+    "MetricGroup": "PipelineL1",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "frontend_bound_by_latency",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because of a latency bottleneck in the frontend (such as instruction cache or TLB misses).",
+    "MetricExpr": "d_ratio((8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)",
+    "MetricGroup": "PipelineL2;frontend_bound_group",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "frontend_bound_by_bandwidth",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because of a bandwidth bottleneck in the frontend (such as decode or op cache fetch bandwidth).",
+    "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend - (8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)",
+    "MetricGroup": "PipelineL2;frontend_bound_group",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "bad_speculation_from_mispredicts",
+    "BriefDescription": "Percentage of dispatched ops that were flushed due to branch mispredicts.",
+    "MetricExpr": "d_ratio(bad_speculation * ex_ret_brn_misp, ex_ret_brn_misp + bp_fe_redir.resync)",
+    "MetricGroup": "PipelineL2;bad_speculation_group",
+    "ScaleUnit": "100%ops"
+  },
+  {
+    "MetricName": "bad_speculation_from_pipeline_restarts",
+    "BriefDescription": "Percentage of dispatched ops that were flushed due to pipeline restarts (resyncs).",
+    "MetricExpr": "d_ratio(bad_speculation * bp_fe_redir.resync, ex_ret_brn_misp + bp_fe_redir.resync)",
+    "MetricGroup": "PipelineL2;bad_speculation_group",
+    "ScaleUnit": "100%ops"
+  },
+  {
+    "MetricName": "backend_bound_by_memory",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls due to the memory subsystem.",
+    "MetricExpr": "backend_bound * d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete)",
+    "MetricGroup": "PipelineL2;backend_bound_group",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "backend_bound_by_cpu",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls not related to the memory subsystem.",
+    "MetricExpr": "backend_bound * (1 - d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete))",
+    "MetricGroup": "PipelineL2;backend_bound_group",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "retiring_from_fastpath",
+    "BriefDescription": "Percentage of dispatch slots used by fastpath ops that retired.",
+    "MetricExpr": "retiring * (1 - d_ratio(ex_ret_ucode_ops, ex_ret_ops))",
+    "MetricGroup": "PipelineL2;retiring_group",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "retiring_from_microcode",
+    "BriefDescription": "Percentage of dispatch slots used by microcode ops that retired.",
+    "MetricExpr": "retiring * d_ratio(ex_ret_ucode_ops, ex_ret_ops)",
+    "MetricGroup": "PipelineL2;retiring_group",
+    "ScaleUnit": "100%slots"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen6/recommended.json
new file mode 100644
index 000000000000..2849a8c159f6
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/recommended.json
@@ -0,0 +1,339 @@
+[
+  {
+    "MetricName": "branch_misprediction_rate",
+    "BriefDescription": "Execution-time branch misprediction rate (non-speculative).",
+    "MetricExpr": "d_ratio(ex_ret_brn_misp, ex_ret_brn)",
+    "MetricGroup": "branch_prediction",
+    "ScaleUnit": "1per_branch"
+  },
+  {
+    "MetricName": "all_data_cache_accesses_pti",
+    "BriefDescription": "All data cache accesses per thousand instructions.",
+    "MetricExpr": "ls_dispatch.all / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "all_l2_cache_accesses_pti",
+    "BriefDescription": "All L2 cache accesses per thousand instructions.",
+    "MetricExpr": "(l2_request_g1.no_pf_all + l2_pf_hit_l2.l2_hwpf + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_accesses_from_l1_ic_misses_pti",
+    "BriefDescription": "L2 cache accesses from L1 instruction cache misses (including prefetch) per thousand instructions.",
+    "MetricExpr": "l2_request_g1.cacheable_ic_read / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_accesses_from_l1_dc_misses_pti",
+    "BriefDescription": "L2 cache accesses from L1 data cache misses (including prefetch) per thousand instructions.",
+    "MetricExpr": "l2_request_g1.dc_all / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_accesses_from_l2_hwpf_pti",
+    "BriefDescription": "L2 cache accesses from L2 cache hardware prefetcher per thousand instructions.",
+    "MetricExpr": "(l2_pf_hit_l2.l1_dc_l2_hwpf + l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "all_l2_cache_misses_pti",
+    "BriefDescription": "All L2 cache misses per thousand instructions.",
+    "MetricExpr": "(l2_cache_req_stat.ic_dc_miss_in_l2 + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_misses_from_l1_ic_miss_pti",
+    "BriefDescription": "L2 cache misses from L1 instruction cache misses per thousand instructions.",
+    "MetricExpr": "l2_cache_req_stat.ic_fill_miss / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_misses_from_l1_dc_miss_pti",
+    "BriefDescription": "L2 cache misses from L1 data cache misses per thousand instructions.",
+    "MetricExpr": "l2_cache_req_stat.ls_rd_blk_c / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_misses_from_l2_hwpf_pti",
+    "BriefDescription": "L2 cache misses from L2 cache hardware prefetcher per thousand instructions.",
+    "MetricExpr": "(l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "all_l2_cache_hits_pti",
+    "BriefDescription": "All L2 cache hits per thousand instructions.",
+    "MetricExpr": "(l2_cache_req_stat.ic_dc_hit_in_l2 + l2_pf_hit_l2.l2_hwpf) / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_hits_from_l1_ic_miss_pti",
+    "BriefDescription": "L2 cache hits from L1 instruction cache misses per thousand instructions.",
+    "MetricExpr": "l2_cache_req_stat.ic_hit_in_l2 / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_hits_from_l1_dc_miss_pti",
+    "BriefDescription": "L2 cache hits from L1 data cache misses per thousand instructions.",
+    "MetricExpr": "l2_cache_req_stat.dc_hit_in_l2 / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_hits_from_l2_hwpf_pti",
+    "BriefDescription": "L2 cache hits from L2 cache hardware prefetcher per thousand instructions.",
+    "MetricExpr": "l2_pf_hit_l2.l1_dc_l2_hwpf / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l3_cache_accesses",
+    "BriefDescription": "L3 cache accesses.",
+    "MetricExpr": "l3_lookup_state.all_coherent_accesses_to_l3",
+    "MetricGroup": "l3_cache"
+  },
+  {
+    "MetricName": "l3_misses",
+    "BriefDescription": "L3 misses (including cacheline state change requests).",
+    "MetricExpr": "l3_lookup_state.l3_miss",
+    "MetricGroup": "l3_cache"
+  },
+  {
+    "MetricName": "l3_read_miss_latency",
+    "BriefDescription": "Average L3 read miss latency (in core clocks).",
+    "MetricExpr": "(l3_xi_sampled_latency.all * 10) / l3_xi_sampled_latency_requests.all",
+    "MetricGroup": "l3_cache",
+    "ScaleUnit": "1ns"
+  },
+  {
+    "MetricName": "l3_read_miss_latency_for_local_dram",
+    "BriefDescription": "Average L3 read miss latency (in core clocks) for local DRAM.",
+    "MetricExpr": "(l3_xi_sampled_latency.dram_near * 10) / l3_xi_sampled_latency_requests.dram_near",
+    "MetricGroup": "l3_cache",
+    "ScaleUnit": "1ns"
+  },
+  {
+    "MetricName": "l3_read_miss_latency_for_remote_dram",
+    "BriefDescription": "Average L3 read miss latency (in core clocks) for remote DRAM.",
+    "MetricExpr": "(l3_xi_sampled_latency.dram_far * 10) / l3_xi_sampled_latency_requests.dram_far",
+    "MetricGroup": "l3_cache",
+    "ScaleUnit": "1ns"
+  },
+  {
+    "MetricName": "op_cache_fetch_miss_ratio",
+    "BriefDescription": "Op cache miss ratio for all fetches.",
+    "MetricExpr": "d_ratio(op_cache_hit_miss.miss, op_cache_hit_miss.all)",
+    "ScaleUnit": "100%"
+  },
+  {
+    "MetricName": "l1_data_cache_fills_from_memory_pti",
+    "BriefDescription": "L1 data cache fills from DRAM or MMIO in any NUMA node per thousand instructions.",
+    "MetricExpr": "ls_any_fills_from_sys.dram_io_all / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_data_cache_fills_from_remote_node_pti",
+    "BriefDescription": "L1 data cache fills from a different NUMA node per thousand instructions.",
+    "MetricExpr": "ls_any_fills_from_sys.far_all / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_data_cache_fills_from_same_ccx_pti",
+    "BriefDescription": "L1 data cache fills from within the same CCX per thousand instructions.",
+    "MetricExpr": "ls_any_fills_from_sys.local_all / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_data_cache_fills_from_different_ccx_pti",
+    "BriefDescription": "L1 data cache fills from another CCX cache in any NUMA node per thousand instructions.",
+    "MetricExpr": "ls_any_fills_from_sys.remote_cache / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "all_l1_data_cache_fills_pti",
+    "BriefDescription": "All L1 data cache fills per thousand instructions.",
+    "MetricExpr": "ls_any_fills_from_sys.all / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_local_l2_pti",
+    "BriefDescription": "L1 demand data cache fills from local L2 cache per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.local_l2 / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_same_ccx_pti",
+    "BriefDescription": "L1 demand data cache fills from within the same CCX per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.local_ccx / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_near_cache_pti",
+    "BriefDescription": "L1 demand data cache fills from another CCX cache in the same NUMA node per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.near_cache / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_near_memory_pti",
+    "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in the same NUMA node per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_near / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_far_cache_pti",
+    "BriefDescription": "L1 demand data cache fills from another CCX cache in a different NUMA node per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.far_cache / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_far_memory_pti",
+    "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in a different NUMA node per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_far / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_itlb_misses_pti",
+    "BriefDescription": "L1 instruction TLB misses per thousand instructions.",
+    "MetricExpr": "(bp_l1_tlb_miss_l2_tlb_hit + bp_l1_tlb_miss_l2_tlb_miss.all) / instructions",
+    "MetricGroup": "tlb",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_itlb_misses_pti",
+    "BriefDescription": "L2 instruction TLB misses and instruction page walks per thousand instructions.",
+    "MetricExpr": "bp_l1_tlb_miss_l2_tlb_miss.all / instructions",
+    "MetricGroup": "tlb",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_dtlb_misses_pti",
+    "BriefDescription": "L1 data TLB misses per thousand instructions.",
+    "MetricExpr": "ls_l1_d_tlb_miss.all / instructions",
+    "MetricGroup": "tlb",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_dtlb_misses_pti",
+    "BriefDescription": "L2 data TLB misses and data page walks per thousand instructions.",
+    "MetricExpr": "ls_l1_d_tlb_miss.l2_miss_all / instructions",
+    "MetricGroup": "tlb",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "all_tlbs_flushed_pti",
+    "BriefDescription": "All TLBs flushed per thousand instructions.",
+    "MetricExpr": "ls_tlb_flush.all / instructions",
+    "MetricGroup": "tlb",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "macro_ops_dispatched",
+    "BriefDescription": "Macro-ops dispatched.",
+    "MetricExpr": "de_src_op_disp.all",
+    "MetricGroup": "decoder"
+  },
+  {
+    "MetricName": "sse_avx_stalls",
+    "BriefDescription": "Mixed SSE/AVX stalls.",
+    "MetricExpr": "fp_disp_faults.sse_avx_all"
+  },
+  {
+    "MetricName": "macro_ops_retired",
+    "BriefDescription": "Macro-ops retired.",
+    "MetricExpr": "ex_ret_ops"
+  },
+  {
+    "MetricName": "umc_data_bus_utilization",
+    "BriefDescription": "Memory controller data bus utilization.",
+    "MetricExpr": "d_ratio(umc_data_slot_clks.all / 2, umc_mem_clk)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "100%"
+  },
+  {
+    "MetricName": "umc_cas_cmd_rate",
+    "BriefDescription": "Memory controller CAS command rate.",
+    "MetricExpr": "d_ratio(umc_cas_cmd.all * 1000, umc_mem_clk)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1per_memclk"
+  },
+  {
+    "MetricName": "umc_cas_cmd_read_ratio",
+    "BriefDescription": "Ratio of memory controller CAS commands for reads.",
+    "MetricExpr": "d_ratio(umc_cas_cmd.rd, umc_cas_cmd.all)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "100%"
+  },
+  {
+    "MetricName": "umc_cas_cmd_write_ratio",
+    "BriefDescription": "Ratio of memory controller CAS commands for writes.",
+    "MetricExpr": "d_ratio(umc_cas_cmd.wr, umc_cas_cmd.all)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "100%"
+  },
+  {
+    "MetricName": "umc_mem_read_bandwidth",
+    "BriefDescription": "Estimated memory read bandwidth.",
+    "MetricExpr": "(umc_cas_cmd.rd * 64) / 1e6 / duration_time",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1MB/s"
+  },
+  {
+    "MetricName": "umc_mem_write_bandwidth",
+    "BriefDescription": "Estimated memory write bandwidth.",
+    "MetricExpr": "(umc_cas_cmd.wr * 64) / 1e6 / duration_time",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1MB/s"
+  },
+  {
+    "MetricName": "umc_mem_bandwidth",
+    "BriefDescription": "Estimated combined memory bandwidth.",
+    "MetricExpr": "(umc_cas_cmd.all * 64) / 1e6 / duration_time",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1MB/s"
+  },
+  {
+    "MetricName": "umc_activate_cmd_rate",
+    "BriefDescription": "Memory controller ACTIVATE command rate.",
+    "MetricExpr": "d_ratio(umc_act_cmd.all * 1000, umc_mem_clk)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1per_memclk"
+  },
+  {
+    "MetricName": "umc_precharge_cmd_rate",
+    "BriefDescription": "Memory controller PRECHARGE command rate.",
+    "MetricExpr": "d_ratio(umc_pchg_cmd.all * 1000, umc_mem_clk)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1per_memclk"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 3d0c57198056..149bbe7abaf5 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -43,4 +43,5 @@ AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),v2,amdzen1,core
 AuthenticAMD-23-[[:xdigit:]]+,v1,amdzen2,core
 AuthenticAMD-25-([245][[:xdigit:]]|[[:xdigit:]]),v1,amdzen3,core
 AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen4,core
-AuthenticAMD-26-[[:xdigit:]]+,v1,amdzen5,core
+AuthenticAMD-26-([12467][[:xdigit:]]|[[:xdigit:]]),v1,amdzen5,core
+AuthenticAMD-26-[[:xdigit:]]+,v1,amdzen6,core
diff --git a/tools/perf/pmu-events/arm64_metrics.py b/tools/perf/pmu-events/arm64_metrics.py
new file mode 100755
index 000000000000..4ecda96d11fa
--- /dev/null
+++ b/tools/perf/pmu-events/arm64_metrics.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+import argparse
+import os
+from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents,
+                    MetricGroup)
+from common_metrics import Cycles
+
+# Global command line arguments.
+_args = None
+
+
+def main() -> None:
+    global _args
+
+    def dir_path(path: str) -> str:
+        """Validate path is a directory for argparse."""
+        if os.path.isdir(path):
+            return path
+        raise argparse.ArgumentTypeError(
+            f'\'{path}\' is not a valid directory')
+
+    parser = argparse.ArgumentParser(description="ARM perf json generator")
+    parser.add_argument(
+        "-metricgroups", help="Generate metricgroups data", action='store_true')
+    parser.add_argument("vendor", help="e.g. arm")
+    parser.add_argument("model", help="e.g. neoverse-n1")
+    parser.add_argument(
+        'events_path',
+        type=dir_path,
+        help='Root of tree containing architecture directories containing json files'
+    )
+    _args = parser.parse_args()
+
+    directory = f"{_args.events_path}/arm64/{_args.vendor}/{_args.model}/"
+    LoadEvents(directory)
+
+    all_metrics = MetricGroup("", [
+        Cycles(),
+    ])
+
+    if _args.metricgroups:
+        print(JsonEncodeMetricGroupDescriptions(all_metrics))
+    else:
+        print(JsonEncodeMetric(all_metrics))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/perf/pmu-events/common_metrics.py b/tools/perf/pmu-events/common_metrics.py
new file mode 100644
index 000000000000..fcdfb9d3e648
--- /dev/null
+++ b/tools/perf/pmu-events/common_metrics.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+from metric import (d_ratio, Event, Metric, MetricGroup)
+
+
+def Cycles() -> MetricGroup:
+    cyc_k = Event("cpu\\-cycles:kHh")  # exclude user and guest
+    cyc_g = Event("cpu\\-cycles:G")   # exclude host
+    cyc_u = Event("cpu\\-cycles:uH")  # exclude kernel, hypervisor and guest
+    cyc = cyc_k + cyc_g + cyc_u
+
+    return MetricGroup("lpm_cycles", [
+        Metric("lpm_cycles_total", "Total number of cycles", cyc, "cycles"),
+        Metric("lpm_cycles_user", "User cycles as a percentage of all cycles",
+               d_ratio(cyc_u, cyc), "100%"),
+        Metric("lpm_cycles_kernel", "Kernel cycles as a percentage of all cycles",
+               d_ratio(cyc_k, cyc), "100%"),
+        Metric("lpm_cycles_guest", "Hypervisor guest cycles as a percentage of all cycles",
+               d_ratio(cyc_g, cyc), "100%"),
+    ], description="cycles breakdown per privilege level (users, kernel, guest)")
diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
new file mode 100755
index 000000000000..52035433b505
--- /dev/null
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -0,0 +1,1129 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+import argparse
+import json
+import math
+import os
+import re
+from typing import Optional
+from common_metrics import Cycles
+from metric import (d_ratio, has_event, max, source_count, CheckPmu, Event,
+                    JsonEncodeMetric, JsonEncodeMetricGroupDescriptions,
+                    Literal, LoadEvents, Metric, MetricConstraint, MetricGroup,
+                    MetricRef, Select)
+
+# Global command line arguments.
+_args = None
+interval_sec = Event("duration_time")
+
+
+def Idle() -> Metric:
+    cyc = Event("msr/mperf/")
+    tsc = Event("msr/tsc/")
+    low = max(tsc - cyc, 0)
+    return Metric(
+        "lpm_idle",
+        "Percentage of total wallclock cycles where CPUs are in low power state (C1 or deeper sleep state)",
+        d_ratio(low, tsc), "100%")
+
+
+def Rapl() -> MetricGroup:
+    """Processor power consumption estimate.
+
+    Use events from the running average power limit (RAPL) driver.
+    """
+    # Watts = joules/second
+    pkg = Event("power/energy\\-pkg/")
+    cond_pkg = Select(pkg, has_event(pkg), math.nan)
+    cores = Event("power/energy\\-cores/")
+    cond_cores = Select(cores, has_event(cores), math.nan)
+    ram = Event("power/energy\\-ram/")
+    cond_ram = Select(ram, has_event(ram), math.nan)
+    gpu = Event("power/energy\\-gpu/")
+    cond_gpu = Select(gpu, has_event(gpu), math.nan)
+    psys = Event("power/energy\\-psys/")
+    cond_psys = Select(psys, has_event(psys), math.nan)
+    scale = 2.3283064365386962890625e-10
+    metrics = [
+        Metric("lpm_cpu_power_pkg", "",
+               d_ratio(cond_pkg * scale, interval_sec), "Watts"),
+        Metric("lpm_cpu_power_cores", "",
+               d_ratio(cond_cores * scale, interval_sec), "Watts"),
+        Metric("lpm_cpu_power_ram", "",
+               d_ratio(cond_ram * scale, interval_sec), "Watts"),
+        Metric("lpm_cpu_power_gpu", "",
+               d_ratio(cond_gpu * scale, interval_sec), "Watts"),
+        Metric("lpm_cpu_power_psys", "",
+               d_ratio(cond_psys * scale, interval_sec), "Watts"),
+    ]
+
+    return MetricGroup("lpm_cpu_power", metrics,
+                       description="Running Average Power Limit (RAPL) power consumption estimates")
+
+
+def Smi() -> MetricGroup:
+    pmu = "<cpu_core or cpu_atom>" if CheckPmu("cpu_core") else "cpu"
+    aperf = Event('msr/aperf/')
+    cycles = Event('cycles')
+    smi_num = Event('msr/smi/')
+    smi_cycles = Select(Select((aperf - cycles) / aperf, smi_num > 0, 0),
+                        has_event(aperf),
+                        0)
+    return MetricGroup('smi', [
+        Metric('smi_num', 'Number of SMI interrupts.',
+               Select(smi_num, has_event(smi_num), 0), 'SMI#'),
+        # Note, the smi_cycles "Event" is really a reference to the metric.
+        Metric('smi_cycles',
+               'Percentage of cycles spent in System Management Interrupts. '
+               f'Requires /sys/bus/event_source/devices/{pmu}/freeze_on_smi to be 1.',
+               smi_cycles, '100%', threshold=(MetricRef('smi_cycles') > 0.10))
+    ], description='System Management Interrupt metrics')
+
+
+def Tsx() -> Optional[MetricGroup]:
+    pmu = "cpu_core" if CheckPmu("cpu_core") else "cpu"
+    cycles = Event('cycles')
+    cycles_in_tx = Event(f'{pmu}/cycles\\-t/')
+    cycles_in_tx_cp = Event(f'{pmu}/cycles\\-ct/')
+    try:
+        # Test if the tsx event is present in the json, prefer the
+        # sysfs version so that we can detect its presence at runtime.
+        transaction_start = Event("RTM_RETIRED.START")
+        transaction_start = Event(f'{pmu}/tx\\-start/')
+    except:
+        return None
+
+    elision_start = None
+    try:
+        # Elision start isn't supported by all models, but we'll not
+        # generate the tsx_cycles_per_elision metric in that
+        # case. Again, prefer the sysfs encoding of the event.
+        elision_start = Event("HLE_RETIRED.START")
+        elision_start = Event(f'{pmu}/el\\-start/')
+    except:
+        pass
+
+    return MetricGroup('transaction', [
+        Metric('tsx_transactional_cycles',
+               'Percentage of cycles within a transaction region.',
+               Select(cycles_in_tx / cycles, has_event(cycles_in_tx), 0),
+               '100%'),
+        Metric('tsx_aborted_cycles', 'Percentage of cycles in aborted transactions.',
+               Select(max(cycles_in_tx - cycles_in_tx_cp, 0) / cycles,
+                      has_event(cycles_in_tx),
+                      0),
+               '100%'),
+        Metric('tsx_cycles_per_transaction',
+               'Number of cycles within a transaction divided by the number of transactions.',
+               Select(cycles_in_tx / transaction_start,
+                      has_event(cycles_in_tx),
+                      0),
+               "cycles / transaction"),
+        Metric('tsx_cycles_per_elision',
+               'Number of cycles within a transaction divided by the number of elisions.',
+               Select(cycles_in_tx / elision_start,
+                      has_event(elision_start),
+                      0),
+               "cycles / elision") if elision_start else None,
+    ], description="Breakdown of transactional memory statistics")
+
+
+def IntelBr():
+    ins = Event("instructions")
+
+    def Total() -> MetricGroup:
+        br_all = Event("BR_INST_RETIRED.ALL_BRANCHES", "BR_INST_RETIRED.ANY")
+        br_m_all = Event("BR_MISP_RETIRED.ALL_BRANCHES",
+                         "BR_INST_RETIRED.MISPRED",
+                         "BR_MISP_EXEC.ANY")
+        br_clr = None
+        try:
+            br_clr = Event("BACLEARS.ANY", "BACLEARS.ALL")
+        except:
+            pass
+
+        br_r = d_ratio(br_all, interval_sec)
+        ins_r = d_ratio(ins, br_all)
+        misp_r = d_ratio(br_m_all, br_all)
+        clr_r = d_ratio(br_clr, interval_sec) if br_clr else None
+
+        return MetricGroup("lpm_br_total", [
+            Metric("lpm_br_total_retired",
+                   "The number of branch instructions retired per second.", br_r,
+                   "insn/s"),
+            Metric(
+                "lpm_br_total_mispred",
+                "The number of branch instructions retired, of any type, that were "
+                "not correctly predicted as a percentage of all branch instrucions.",
+                misp_r, "100%"),
+            Metric("lpm_br_total_insn_between_branches",
+                   "The number of instructions divided by the number of branches.",
+                   ins_r, "insn"),
+            Metric("lpm_br_total_insn_fe_resteers",
+                   "The number of resync branches per second.", clr_r, "req/s"
+                   ) if clr_r else None
+        ])
+
+    def Taken() -> MetricGroup:
+        br_all = Event("BR_INST_RETIRED.ALL_BRANCHES", "BR_INST_RETIRED.ANY")
+        br_m_tk = None
+        try:
+            br_m_tk = Event("BR_MISP_RETIRED.NEAR_TAKEN",
+                            "BR_MISP_RETIRED.TAKEN_JCC",
+                            "BR_INST_RETIRED.MISPRED_TAKEN")
+        except:
+            pass
+        br_r = d_ratio(br_all, interval_sec)
+        ins_r = d_ratio(ins, br_all)
+        misp_r = d_ratio(br_m_tk, br_all) if br_m_tk else None
+        return MetricGroup("lpm_br_taken", [
+            Metric("lpm_br_taken_retired",
+                   "The number of taken branches that were retired per second.",
+                   br_r, "insn/s"),
+            Metric(
+                "lpm_br_taken_mispred",
+                "The number of retired taken branch instructions that were "
+                "mispredicted as a percentage of all taken branches.", misp_r,
+                "100%") if misp_r else None,
+            Metric(
+                "lpm_br_taken_insn_between_branches",
+                "The number of instructions divided by the number of taken branches.",
+                ins_r, "insn"),
+        ])
+
+    def Conditional() -> Optional[MetricGroup]:
+        try:
+            br_cond = Event("BR_INST_RETIRED.COND",
+                            "BR_INST_RETIRED.CONDITIONAL",
+                            "BR_INST_RETIRED.TAKEN_JCC")
+            br_m_cond = Event("BR_MISP_RETIRED.COND",
+                              "BR_MISP_RETIRED.CONDITIONAL",
+                              "BR_MISP_RETIRED.TAKEN_JCC")
+        except:
+            return None
+
+        br_cond_nt = None
+        br_m_cond_nt = None
+        try:
+            br_cond_nt = Event("BR_INST_RETIRED.COND_NTAKEN")
+            br_m_cond_nt = Event("BR_MISP_RETIRED.COND_NTAKEN")
+        except:
+            pass
+        br_r = d_ratio(br_cond, interval_sec)
+        ins_r = d_ratio(ins, br_cond)
+        misp_r = d_ratio(br_m_cond, br_cond)
+        taken_metrics = [
+            Metric("lpm_br_cond_retired", "Retired conditional branch instructions.",
+                   br_r, "insn/s"),
+            Metric("lpm_br_cond_insn_between_branches",
+                   "The number of instructions divided by the number of conditional "
+                   "branches.", ins_r, "insn"),
+            Metric("lpm_br_cond_mispred",
+                   "Retired conditional branch instructions mispredicted as a "
+                   "percentage of all conditional branches.", misp_r, "100%"),
+        ]
+        if not br_m_cond_nt:
+            return MetricGroup("lpm_br_cond", taken_metrics)
+
+        br_r = d_ratio(br_cond_nt, interval_sec)
+        ins_r = d_ratio(ins, br_cond_nt)
+        misp_r = d_ratio(br_m_cond_nt, br_cond_nt)
+
+        not_taken_metrics = [
+            Metric("lpm_br_cond_retired", "Retired conditional not taken branch instructions.",
+                   br_r, "insn/s"),
+            Metric("lpm_br_cond_insn_between_branches",
+                   "The number of instructions divided by the number of not taken conditional "
+                   "branches.", ins_r, "insn"),
+            Metric("lpm_br_cond_mispred",
+                   "Retired not taken conditional branch instructions mispredicted as a "
+                   "percentage of all not taken conditional branches.", misp_r, "100%"),
+        ]
+        return MetricGroup("lpm_br_cond", [
+            MetricGroup("lpm_br_cond_nt", not_taken_metrics),
+            MetricGroup("lpm_br_cond_tkn", taken_metrics),
+        ])
+
+    def Far() -> Optional[MetricGroup]:
+        try:
+            br_far = Event("BR_INST_RETIRED.FAR_BRANCH")
+        except:
+            return None
+
+        br_r = d_ratio(br_far, interval_sec)
+        ins_r = d_ratio(ins, br_far)
+        return MetricGroup("lpm_br_far", [
+            Metric("lpm_br_far_retired", "Retired far control transfers per second.",
+                   br_r, "insn/s"),
+            Metric(
+                "lpm_br_far_insn_between_branches",
+                "The number of instructions divided by the number of far branches.",
+                ins_r, "insn"),
+        ])
+
+    return MetricGroup("lpm_br", [Total(), Taken(), Conditional(), Far()],
+                       description="breakdown of retired branch instructions")
+
+
+def IntelCtxSw() -> MetricGroup:
+    cs = Event("context\\-switches")
+    metrics = [
+        Metric("lpm_cs_rate", "Context switches per second",
+               d_ratio(cs, interval_sec), "ctxsw/s")
+    ]
+
+    ev = Event("instructions")
+    metrics.append(Metric("lpm_cs_instr", "Instructions per context switch",
+                          d_ratio(ev, cs), "instr/cs"))
+
+    ev = Event("cycles")
+    metrics.append(Metric("lpm_cs_cycles", "Cycles per context switch",
+                          d_ratio(ev, cs), "cycles/cs"))
+
+    try:
+        ev = Event("MEM_INST_RETIRED.ALL_LOADS", "MEM_UOPS_RETIRED.ALL_LOADS")
+        metrics.append(Metric("lpm_cs_loads", "Loads per context switch",
+                              d_ratio(ev, cs), "loads/cs"))
+    except:
+        pass
+
+    try:
+        ev = Event("MEM_INST_RETIRED.ALL_STORES",
+                   "MEM_UOPS_RETIRED.ALL_STORES")
+        metrics.append(Metric("lpm_cs_stores", "Stores per context switch",
+                              d_ratio(ev, cs), "stores/cs"))
+    except:
+        pass
+
+    try:
+        ev = Event("BR_INST_RETIRED.NEAR_TAKEN", "BR_INST_RETIRED.TAKEN_JCC")
+        metrics.append(Metric("lpm_cs_br_taken", "Branches taken per context switch",
+                              d_ratio(ev, cs), "br_taken/cs"))
+    except:
+        pass
+
+    try:
+        l2_misses = (Event("L2_RQSTS.DEMAND_DATA_RD_MISS") +
+                     Event("L2_RQSTS.RFO_MISS") +
+                     Event("L2_RQSTS.CODE_RD_MISS"))
+        try:
+            l2_misses += Event("L2_RQSTS.HWPF_MISS",
+                               "L2_RQSTS.L2_PF_MISS", "L2_RQSTS.PF_MISS")
+        except:
+            pass
+
+        metrics.append(Metric("lpm_cs_l2_misses", "L2 misses per context switch",
+                              d_ratio(l2_misses, cs), "l2_misses/cs"))
+    except:
+        pass
+
+    return MetricGroup("lpm_cs", metrics,
+                       description=("Number of context switches per second, instructions "
+                                    "retired & core cycles between context switches"))
+
+
+def IntelFpu() -> Optional[MetricGroup]:
+    cyc = Event("cycles")
+    try:
+        s_64 = Event("FP_ARITH_INST_RETIRED.SCALAR_SINGLE",
+                     "SIMD_INST_RETIRED.SCALAR_SINGLE")
+    except:
+        return None
+    d_64 = Event("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE",
+                 "SIMD_INST_RETIRED.SCALAR_DOUBLE")
+    s_128 = Event("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE",
+                  "SIMD_INST_RETIRED.PACKED_SINGLE")
+
+    flop = s_64 + d_64 + 4 * s_128
+
+    d_128 = None
+    s_256 = None
+    d_256 = None
+    s_512 = None
+    d_512 = None
+    try:
+        d_128 = Event("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE")
+        flop += 2 * d_128
+        s_256 = Event("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE")
+        flop += 8 * s_256
+        d_256 = Event("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE")
+        flop += 4 * d_256
+        s_512 = Event("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE")
+        flop += 16 * s_512
+        d_512 = Event("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE")
+        flop += 8 * d_512
+    except:
+        pass
+
+    f_assist = Event("ASSISTS.FP", "FP_ASSIST.ANY", "FP_ASSIST.S")
+    if f_assist in [
+        "ASSISTS.FP",
+        "FP_ASSIST.S",
+    ]:
+        f_assist += "/cmask=1/"
+
+    flop_r = d_ratio(flop, interval_sec)
+    flop_c = d_ratio(flop, cyc)
+    nmi_constraint = MetricConstraint.GROUPED_EVENTS
+    if f_assist.name == "ASSISTS.FP":  # Icelake+
+        nmi_constraint = MetricConstraint.NO_GROUP_EVENTS_NMI
+
+    def FpuMetrics(group: str, fl: Optional[Event], mult: int, desc: str) -> Optional[MetricGroup]:
+        if not fl:
+            return None
+
+        f = fl * mult
+        fl_r = d_ratio(f, interval_sec)
+        r_s = d_ratio(fl, interval_sec)
+        return MetricGroup(group, [
+            Metric(f"{group}_of_total", desc + " floating point operations per second",
+                   d_ratio(f, flop), "100%"),
+            Metric(f"{group}_flops", desc + " floating point operations per second",
+                   fl_r, "flops/s"),
+            Metric(f"{group}_ops", desc + " operations per second",
+                   r_s, "ops/s"),
+        ])
+
+    return MetricGroup("lpm_fpu", [
+        MetricGroup("lpm_fpu_total", [
+            Metric("lpm_fpu_total_flops", "Floating point operations per second",
+                   flop_r, "flops/s"),
+            Metric("lpm_fpu_total_flopc", "Floating point operations per cycle",
+                   flop_c, "flops/cycle", constraint=nmi_constraint),
+        ]),
+        MetricGroup("lpm_fpu_64", [
+            FpuMetrics("lpm_fpu_64_single", s_64, 1, "64-bit single"),
+            FpuMetrics("lpm_fpu_64_double", d_64, 1, "64-bit double"),
+        ]),
+        MetricGroup("lpm_fpu_128", [
+            FpuMetrics("lpm_fpu_128_single", s_128,
+                       4, "128-bit packed single"),
+            FpuMetrics("lpm_fpu_128_double", d_128,
+                       2, "128-bit packed double"),
+        ]),
+        MetricGroup("lpm_fpu_256", [
+            FpuMetrics("lpm_fpu_256_single", s_256,
+                       8, "128-bit packed single"),
+            FpuMetrics("lpm_fpu_256_double", d_256,
+                       4, "128-bit packed double"),
+        ]),
+        MetricGroup("lpm_fpu_512", [
+            FpuMetrics("lpm_fpu_512_single", s_512,
+                       16, "128-bit packed single"),
+            FpuMetrics("lpm_fpu_512_double", d_512,
+                       8, "128-bit packed double"),
+        ]),
+        Metric("lpm_fpu_assists", "FP assists as a percentage of cycles",
+               d_ratio(f_assist, cyc), "100%"),
+    ])
+
+
+def IntelIlp() -> MetricGroup:
+    tsc = Event("msr/tsc/")
+    c0 = Event("msr/mperf/")
+    low = tsc - c0
+    inst_ret = Event("INST_RETIRED.ANY_P")
+    inst_ret_c = [Event(f"{inst_ret.name}/cmask={x}/") for x in range(1, 6)]
+    core_cycles = Event("CPU_CLK_UNHALTED.THREAD_P_ANY",
+                        "CPU_CLK_UNHALTED.DISTRIBUTED",
+                        "cycles")
+    ilp = [d_ratio(max(inst_ret_c[x] - inst_ret_c[x + 1], 0), core_cycles)
+           for x in range(0, 4)]
+    ilp.append(d_ratio(inst_ret_c[4], core_cycles))
+    ilp0 = 1
+    for x in ilp:
+        ilp0 -= x
+    return MetricGroup("lpm_ilp", [
+        Metric("lpm_ilp_idle", "Lower power cycles as a percentage of all cycles",
+               d_ratio(low, tsc), "100%"),
+        Metric("lpm_ilp_inst_ret_0",
+               "Instructions retired in 0 cycles as a percentage of all cycles",
+               ilp0, "100%"),
+        Metric("lpm_ilp_inst_ret_1",
+               "Instructions retired in 1 cycles as a percentage of all cycles",
+               ilp[0], "100%"),
+        Metric("lpm_ilp_inst_ret_2",
+               "Instructions retired in 2 cycles as a percentage of all cycles",
+               ilp[1], "100%"),
+        Metric("lpm_ilp_inst_ret_3",
+               "Instructions retired in 3 cycles as a percentage of all cycles",
+               ilp[2], "100%"),
+        Metric("lpm_ilp_inst_ret_4",
+               "Instructions retired in 4 cycles as a percentage of all cycles",
+               ilp[3], "100%"),
+        Metric("lpm_ilp_inst_ret_5",
+               "Instructions retired in 5 or more cycles as a percentage of all cycles",
+               ilp[4], "100%"),
+    ])
+
+
+def IntelL2() -> Optional[MetricGroup]:
+    try:
+        DC_HIT = Event("L2_RQSTS.DEMAND_DATA_RD_HIT")
+    except:
+        return None
+    try:
+        DC_MISS = Event("L2_RQSTS.DEMAND_DATA_RD_MISS")
+        l2_dmnd_miss = DC_MISS
+        l2_dmnd_rd_all = DC_MISS + DC_HIT
+    except:
+        DC_ALL = Event("L2_RQSTS.ALL_DEMAND_DATA_RD")
+        l2_dmnd_miss = DC_ALL - DC_HIT
+        l2_dmnd_rd_all = DC_ALL
+    l2_dmnd_mrate = d_ratio(l2_dmnd_miss, interval_sec)
+    l2_dmnd_rrate = d_ratio(l2_dmnd_rd_all, interval_sec)
+
+    DC_PFH = None
+    DC_PFM = None
+    l2_pf_all = None
+    l2_pf_mrate = None
+    l2_pf_rrate = None
+    try:
+        DC_PFH = Event("L2_RQSTS.PF_HIT")
+        DC_PFM = Event("L2_RQSTS.PF_MISS")
+        l2_pf_all = DC_PFH + DC_PFM
+        l2_pf_mrate = d_ratio(DC_PFM, interval_sec)
+        l2_pf_rrate = d_ratio(l2_pf_all, interval_sec)
+    except:
+        pass
+
+    DC_RFOH = None
+    DC_RFOM = None
+    l2_rfo_all = None
+    l2_rfo_mrate = None
+    l2_rfo_rrate = None
+    try:
+        DC_RFOH = Event("L2_RQSTS.RFO_HIT")
+        DC_RFOM = Event("L2_RQSTS.RFO_MISS")
+        l2_rfo_all = DC_RFOH + DC_RFOM
+        l2_rfo_mrate = d_ratio(DC_RFOM, interval_sec)
+        l2_rfo_rrate = d_ratio(l2_rfo_all, interval_sec)
+    except:
+        pass
+
+    DC_CH = None
+    try:
+        DC_CH = Event("L2_RQSTS.CODE_RD_HIT")
+    except:
+        pass
+    DC_CM = Event("L2_RQSTS.CODE_RD_MISS")
+    DC_IN = Event("L2_LINES_IN.ALL")
+    DC_OUT_NS = None
+    DC_OUT_S = None
+    l2_lines_out = None
+    l2_out_rate = None
+    wbn = None
+    isd = None
+    try:
+        DC_OUT_NS = Event("L2_LINES_OUT.NON_SILENT",
+                          "L2_LINES_OUT.DEMAND_DIRTY",
+                          "L2_LINES_IN.S")
+        DC_OUT_S = Event("L2_LINES_OUT.SILENT",
+                         "L2_LINES_OUT.DEMAND_CLEAN",
+                         "L2_LINES_IN.I")
+        if DC_OUT_S.name == "L2_LINES_OUT.SILENT" and (
+                args.model.startswith("skylake") or
+                args.model == "cascadelakex"):
+            DC_OUT_S.name = "L2_LINES_OUT.SILENT/any/"
+        # bring is back to per-CPU
+        l2_s = Select(DC_OUT_S / 2, Literal("#smt_on"), DC_OUT_S)
+        l2_ns = DC_OUT_NS
+        l2_lines_out = l2_s + l2_ns
+        l2_out_rate = d_ratio(l2_lines_out, interval_sec)
+        nlr = max(l2_ns - DC_WB_U - DC_WB_D, 0)
+        wbn = d_ratio(nlr, interval_sec)
+        isd = d_ratio(l2_s, interval_sec)
+    except:
+        pass
+    DC_OUT_U = None
+    l2_pf_useless = None
+    l2_useless_rate = None
+    try:
+        DC_OUT_U = Event("L2_LINES_OUT.USELESS_HWPF")
+        l2_pf_useless = DC_OUT_U
+        l2_useless_rate = d_ratio(l2_pf_useless, interval_sec)
+    except:
+        pass
+    DC_WB_U = None
+    DC_WB_D = None
+    wbu = None
+    wbd = None
+    try:
+        DC_WB_U = Event("IDI_MISC.WB_UPGRADE")
+        DC_WB_D = Event("IDI_MISC.WB_DOWNGRADE")
+        wbu = d_ratio(DC_WB_U, interval_sec)
+        wbd = d_ratio(DC_WB_D, interval_sec)
+    except:
+        pass
+
+    l2_lines_in = DC_IN
+    l2_code_all = (DC_CH + DC_CM) if DC_CH else None
+    l2_code_rate = d_ratio(l2_code_all, interval_sec) if DC_CH else None
+    l2_code_miss_rate = d_ratio(DC_CM, interval_sec)
+    l2_in_rate = d_ratio(l2_lines_in, interval_sec)
+
+    return MetricGroup("lpm_l2", [
+        MetricGroup("lpm_l2_totals", [
+            Metric("lpm_l2_totals_in", "L2 cache total in per second",
+                   l2_in_rate, "In/s"),
+            Metric("lpm_l2_totals_out", "L2 cache total out per second",
+                   l2_out_rate, "Out/s") if l2_out_rate else None,
+        ]),
+        MetricGroup("lpm_l2_rd", [
+            Metric("lpm_l2_rd_hits", "L2 cache data read hits",
+                   d_ratio(DC_HIT, l2_dmnd_rd_all), "100%"),
+            Metric("lpm_l2_rd_hits", "L2 cache data read hits",
+                   d_ratio(l2_dmnd_miss, l2_dmnd_rd_all), "100%"),
+            Metric("lpm_l2_rd_requests", "L2 cache data read requests per second",
+                   l2_dmnd_rrate, "requests/s"),
+            Metric("lpm_l2_rd_misses", "L2 cache data read misses per second",
+                   l2_dmnd_mrate, "misses/s"),
+        ]),
+        MetricGroup("lpm_l2_hwpf", [
+            Metric("lpm_l2_hwpf_hits", "L2 cache hardware prefetcher hits",
+                   d_ratio(DC_PFH, l2_pf_all), "100%"),
+            Metric("lpm_l2_hwpf_misses", "L2 cache hardware prefetcher misses",
+                   d_ratio(DC_PFM, l2_pf_all), "100%"),
+            Metric("lpm_l2_hwpf_useless", "L2 cache hardware prefetcher useless prefetches per second",
+                   l2_useless_rate, "100%") if l2_useless_rate else None,
+            Metric("lpm_l2_hwpf_requests", "L2 cache hardware prefetcher requests per second",
+                   l2_pf_rrate, "100%"),
+            Metric("lpm_l2_hwpf_misses", "L2 cache hardware prefetcher misses per second",
+                   l2_pf_mrate, "100%"),
+        ]) if DC_PFH else None,
+        MetricGroup("lpm_l2_rfo", [
+            Metric("lpm_l2_rfo_hits", "L2 cache request for ownership (RFO) hits",
+                   d_ratio(DC_RFOH, l2_rfo_all), "100%"),
+            Metric("lpm_l2_rfo_misses", "L2 cache request for ownership (RFO) misses",
+                   d_ratio(DC_RFOM, l2_rfo_all), "100%"),
+            Metric("lpm_l2_rfo_requests", "L2 cache request for ownership (RFO) requests per second",
+                   l2_rfo_rrate, "requests/s"),
+            Metric("lpm_l2_rfo_misses", "L2 cache request for ownership (RFO) misses per second",
+                   l2_rfo_mrate, "misses/s"),
+        ]) if DC_RFOH else None,
+        MetricGroup("lpm_l2_code", [
+            Metric("lpm_l2_code_hits", "L2 cache code hits",
+                   d_ratio(DC_CH, l2_code_all), "100%") if DC_CH else None,
+            Metric("lpm_l2_code_misses", "L2 cache code misses",
+                   d_ratio(DC_CM, l2_code_all), "100%") if DC_CH else None,
+            Metric("lpm_l2_code_requests", "L2 cache code requests per second",
+                   l2_code_rate, "requests/s") if DC_CH else None,
+            Metric("lpm_l2_code_misses", "L2 cache code misses per second",
+                   l2_code_miss_rate, "misses/s"),
+        ]),
+        MetricGroup("lpm_l2_evict", [
+            MetricGroup("lpm_l2_evict_mef_lines", [
+                Metric("lpm_l2_evict_mef_lines_l3_hot_lru", "L2 evictions M/E/F lines L3 hot LRU per second",
+                       wbu, "HotLRU/s") if wbu else None,
+                Metric("lpm_l2_evict_mef_lines_l3_norm_lru", "L2 evictions M/E/F lines L3 normal LRU per second",
+                       wbn, "NormLRU/s") if wbn else None,
+                Metric("lpm_l2_evict_mef_lines_dropped", "L2 evictions M/E/F lines dropped per second",
+                       wbd, "dropped/s") if wbd else None,
+                Metric("lpm_l2_evict_is_lines_dropped", "L2 evictions I/S lines dropped per second",
+                       isd, "dropped/s") if isd else None,
+            ]),
+        ]),
+    ], description="L2 data cache analysis")
+
+
+def IntelMissLat() -> Optional[MetricGroup]:
+    try:
+        ticks = Event("UNC_CHA_CLOCKTICKS", "UNC_C_CLOCKTICKS")
+        data_rd_loc_occ = Event("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL",
+                                "UNC_CHA_TOR_OCCUPANCY.IA_MISS",
+                                "UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE",
+                                "UNC_C_TOR_OCCUPANCY.MISS_OPCODE")
+        data_rd_loc_ins = Event("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL",
+                                "UNC_CHA_TOR_INSERTS.IA_MISS",
+                                "UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE",
+                                "UNC_C_TOR_INSERTS.MISS_OPCODE")
+        data_rd_rem_occ = Event("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE",
+                                "UNC_CHA_TOR_OCCUPANCY.IA_MISS",
+                                "UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE",
+                                "UNC_C_TOR_OCCUPANCY.NID_MISS_OPCODE")
+        data_rd_rem_ins = Event("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE",
+                                "UNC_CHA_TOR_INSERTS.IA_MISS",
+                                "UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE",
+                                "UNC_C_TOR_INSERTS.NID_MISS_OPCODE")
+    except:
+        return None
+
+    if (data_rd_loc_occ.name == "UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE" or
+            data_rd_loc_occ.name == "UNC_C_TOR_OCCUPANCY.MISS_OPCODE"):
+        data_rd = 0x182
+        for e in [data_rd_loc_occ, data_rd_loc_ins, data_rd_rem_occ, data_rd_rem_ins]:
+            e.name += f"/filter_opc={hex(data_rd)}/"
+    elif data_rd_loc_occ.name == "UNC_CHA_TOR_OCCUPANCY.IA_MISS":
+        # Demand Data Read - Full cache-line read requests from core for
+        # lines to be cached in S or E, typically for data
+        demand_data_rd = 0x202
+        #  LLC Prefetch Data - Uncore will first look up the line in the
+        #  LLC; for a cache hit, the LRU will be updated, on a miss, the
+        #  DRd will be initiated
+        llc_prefetch_data = 0x25a
+        local_filter = (f"/filter_opc0={hex(demand_data_rd)},"
+                        f"filter_opc1={hex(llc_prefetch_data)},"
+                        "filter_loc,filter_nm,filter_not_nm/")
+        remote_filter = (f"/filter_opc0={hex(demand_data_rd)},"
+                         f"filter_opc1={hex(llc_prefetch_data)},"
+                         "filter_rem,filter_nm,filter_not_nm/")
+        for e in [data_rd_loc_occ, data_rd_loc_ins]:
+            e.name += local_filter
+        for e in [data_rd_rem_occ, data_rd_rem_ins]:
+            e.name += remote_filter
+    else:
+        assert data_rd_loc_occ.name == "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL", data_rd_loc_occ
+
+    ticks_per_cha = ticks / source_count(data_rd_loc_ins)
+    loc_lat = interval_sec * 1e9 * data_rd_loc_occ / \
+        (ticks_per_cha * data_rd_loc_ins)
+    ticks_per_cha = ticks / source_count(data_rd_rem_ins)
+    rem_lat = interval_sec * 1e9 * data_rd_rem_occ / \
+        (ticks_per_cha * data_rd_rem_ins)
+    return MetricGroup("lpm_miss_lat", [
+        Metric("lpm_miss_lat_loc", "Local to a socket miss latency in nanoseconds",
+               loc_lat, "ns"),
+        Metric("lpm_miss_lat_rem", "Remote to a socket miss latency in nanoseconds",
+               rem_lat, "ns"),
+    ])
+
+
+def IntelMlp() -> Optional[Metric]:
+    try:
+        l1d = Event("L1D_PEND_MISS.PENDING")
+        l1dc = Event("L1D_PEND_MISS.PENDING_CYCLES")
+    except:
+        return None
+
+    l1dc = Select(l1dc / 2, Literal("#smt_on"), l1dc)
+    ml = d_ratio(l1d, l1dc)
+    return Metric("lpm_mlp",
+                  "Miss level parallelism - number of outstanding load misses per cycle (higher is better)",
+                  ml, "load_miss_pending/cycle")
+
+
+def IntelPorts() -> Optional[MetricGroup]:
+    pipeline_events = json.load(
+        open(f"{_args.events_path}/x86/{_args.model}/pipeline.json"))
+
+    core_cycles = Event("CPU_CLK_UNHALTED.THREAD_P_ANY",
+                        "CPU_CLK_UNHALTED.DISTRIBUTED",
+                        "cycles")
+    # Number of CPU cycles scaled for SMT.
+    smt_cycles = Select(core_cycles / 2, Literal("#smt_on"), core_cycles)
+
+    metrics = []
+    for x in pipeline_events:
+        if "EventName" in x and re.search("^UOPS_DISPATCHED.PORT", x["EventName"]):
+            name = x["EventName"]
+            port = re.search(r"(PORT_[0-9].*)", name).group(0).lower()
+            if name.endswith("_CORE"):
+                cyc = core_cycles
+            else:
+                cyc = smt_cycles
+            metrics.append(Metric(f"lpm_{port}", f"{port} utilization (higher is better)",
+                                  d_ratio(Event(name), cyc), "100%"))
+    if len(metrics) == 0:
+        return None
+
+    return MetricGroup("lpm_ports", metrics, "functional unit (port) utilization -- "
+                       "fraction of cycles each port is utilized (higher is better)")
+
+
+def IntelSwpf() -> Optional[MetricGroup]:
+    ins = Event("instructions")
+    try:
+        s_ld = Event("MEM_INST_RETIRED.ALL_LOADS",
+                     "MEM_UOPS_RETIRED.ALL_LOADS")
+        s_nta = Event("SW_PREFETCH_ACCESS.NTA")
+        s_t0 = Event("SW_PREFETCH_ACCESS.T0")
+        s_t1 = Event("SW_PREFETCH_ACCESS.T1_T2")
+        s_w = Event("SW_PREFETCH_ACCESS.PREFETCHW")
+    except:
+        return None
+
+    all_sw = s_nta + s_t0 + s_t1 + s_w
+    swp_r = d_ratio(all_sw, interval_sec)
+    ins_r = d_ratio(ins, all_sw)
+    ld_r = d_ratio(s_ld, all_sw)
+
+    return MetricGroup("lpm_swpf", [
+        MetricGroup("lpm_swpf_totals", [
+            Metric("lpm_swpf_totals_exec", "Software prefetch instructions per second",
+                   swp_r, "swpf/s"),
+            Metric("lpm_swpf_totals_insn_per_pf",
+                   "Average number of instructions between software prefetches",
+                   ins_r, "insn/swpf"),
+            Metric("lpm_swpf_totals_loads_per_pf",
+                   "Average number of loads between software prefetches",
+                   ld_r, "loads/swpf"),
+        ]),
+        MetricGroup("lpm_swpf_bkdwn", [
+            MetricGroup("lpm_swpf_bkdwn_nta", [
+                Metric("lpm_swpf_bkdwn_nta_per_swpf",
+                       "Software prefetch NTA instructions as a percent of all prefetch instructions",
+                       d_ratio(s_nta, all_sw), "100%"),
+                Metric("lpm_swpf_bkdwn_nta_rate",
+                       "Software prefetch NTA instructions per second",
+                       d_ratio(s_nta, interval_sec), "insn/s"),
+            ]),
+            MetricGroup("lpm_swpf_bkdwn_t0", [
+                Metric("lpm_swpf_bkdwn_t0_per_swpf",
+                       "Software prefetch T0 instructions as a percent of all prefetch instructions",
+                       d_ratio(s_t0, all_sw), "100%"),
+                Metric("lpm_swpf_bkdwn_t0_rate",
+                       "Software prefetch T0 instructions per second",
+                       d_ratio(s_t0, interval_sec), "insn/s"),
+            ]),
+            MetricGroup("lpm_swpf_bkdwn_t1_t2", [
+                Metric("lpm_swpf_bkdwn_t1_t2_per_swpf",
+                       "Software prefetch T1 or T2 instructions as a percent of all prefetch instructions",
+                       d_ratio(s_t1, all_sw), "100%"),
+                Metric("lpm_swpf_bkdwn_t1_t2_rate",
+                       "Software prefetch T1 or T2 instructions per second",
+                       d_ratio(s_t1, interval_sec), "insn/s"),
+            ]),
+            MetricGroup("lpm_swpf_bkdwn_w", [
+                Metric("lpm_swpf_bkdwn_w_per_swpf",
+                       "Software prefetch W instructions as a percent of all prefetch instructions",
+                       d_ratio(s_w, all_sw), "100%"),
+                Metric("lpm_swpf_bkdwn_w_rate",
+                       "Software prefetch W instructions per second",
+                       d_ratio(s_w, interval_sec), "insn/s"),
+            ]),
+        ]),
+    ], description="Software prefetch instruction breakdown")
+
+
+def IntelLdSt() -> Optional[MetricGroup]:
+    if _args.model in [
+        "bonnell",
+        "nehalemep",
+        "nehalemex",
+        "westmereep-dp",
+        "westmereep-sp",
+        "westmereex",
+    ]:
+        return None
+    LDST_LD = Event("MEM_INST_RETIRED.ALL_LOADS", "MEM_UOPS_RETIRED.ALL_LOADS")
+    LDST_ST = Event("MEM_INST_RETIRED.ALL_STORES",
+                    "MEM_UOPS_RETIRED.ALL_STORES")
+    LDST_LDC1 = Event(f"{LDST_LD.name}/cmask=1/")
+    LDST_STC1 = Event(f"{LDST_ST.name}/cmask=1/")
+    LDST_LDC2 = Event(f"{LDST_LD.name}/cmask=2/")
+    LDST_STC2 = Event(f"{LDST_ST.name}/cmask=2/")
+    LDST_LDC3 = Event(f"{LDST_LD.name}/cmask=3/")
+    LDST_STC3 = Event(f"{LDST_ST.name}/cmask=3/")
+    ins = Event("instructions")
+    LDST_CYC = Event("CPU_CLK_UNHALTED.THREAD",
+                     "CPU_CLK_UNHALTED.CORE_P",
+                     "CPU_CLK_UNHALTED.THREAD_P")
+    LDST_PRE = None
+    try:
+        LDST_PRE = Event("LOAD_HIT_PREFETCH.SWPF", "LOAD_HIT_PRE.SW_PF")
+    except:
+        pass
+    LDST_AT = None
+    try:
+        LDST_AT = Event("MEM_INST_RETIRED.LOCK_LOADS")
+    except:
+        pass
+    cyc = LDST_CYC
+
+    ld_rate = d_ratio(LDST_LD, interval_sec)
+    st_rate = d_ratio(LDST_ST, interval_sec)
+    pf_rate = d_ratio(LDST_PRE, interval_sec) if LDST_PRE else None
+    at_rate = d_ratio(LDST_AT, interval_sec) if LDST_AT else None
+
+    ldst_ret_constraint = MetricConstraint.GROUPED_EVENTS
+    if LDST_LD.name == "MEM_UOPS_RETIRED.ALL_LOADS":
+        ldst_ret_constraint = MetricConstraint.NO_GROUP_EVENTS_NMI
+
+    return MetricGroup("lpm_ldst", [
+        MetricGroup("lpm_ldst_total", [
+            Metric("lpm_ldst_total_loads", "Load/store instructions total loads",
+                   ld_rate, "loads"),
+            Metric("lpm_ldst_total_stores", "Load/store instructions total stores",
+                   st_rate, "stores"),
+        ]),
+        MetricGroup("lpm_ldst_prcnt", [
+            Metric("lpm_ldst_prcnt_loads", "Percent of all instructions that are loads",
+                   d_ratio(LDST_LD, ins), "100%"),
+            Metric("lpm_ldst_prcnt_stores", "Percent of all instructions that are stores",
+                   d_ratio(LDST_ST, ins), "100%"),
+        ]),
+        MetricGroup("lpm_ldst_ret_lds", [
+            Metric("lpm_ldst_ret_lds_1", "Retired loads in 1 cycle",
+                   d_ratio(max(LDST_LDC1 - LDST_LDC2, 0), cyc), "100%",
+                   constraint=ldst_ret_constraint),
+            Metric("lpm_ldst_ret_lds_2", "Retired loads in 2 cycles",
+                   d_ratio(max(LDST_LDC2 - LDST_LDC3, 0), cyc), "100%",
+                   constraint=ldst_ret_constraint),
+            Metric("lpm_ldst_ret_lds_3", "Retired loads in 3 or more cycles",
+                   d_ratio(LDST_LDC3, cyc), "100%"),
+        ]),
+        MetricGroup("lpm_ldst_ret_sts", [
+            Metric("lpm_ldst_ret_sts_1", "Retired stores in 1 cycle",
+                   d_ratio(max(LDST_STC1 - LDST_STC2, 0), cyc), "100%",
+                   constraint=ldst_ret_constraint),
+            Metric("lpm_ldst_ret_sts_2", "Retired stores in 2 cycles",
+                   d_ratio(max(LDST_STC2 - LDST_STC3, 0), cyc), "100%",
+                   constraint=ldst_ret_constraint),
+            Metric("lpm_ldst_ret_sts_3", "Retired stores in 3 more cycles",
+                   d_ratio(LDST_STC3, cyc), "100%"),
+        ]),
+        Metric("lpm_ldst_ld_hit_swpf", "Load hit software prefetches per second",
+               pf_rate, "swpf/s") if pf_rate else None,
+        Metric("lpm_ldst_atomic_lds", "Atomic loads per second",
+               at_rate, "loads/s") if at_rate else None,
+    ], description="Breakdown of load/store instructions")
+
+
+def UncoreCState() -> Optional[MetricGroup]:
+    try:
+        pcu_ticks = Event("UNC_P_CLOCKTICKS")
+        c0 = Event("UNC_P_POWER_STATE_OCCUPANCY.CORES_C0")
+        c3 = Event("UNC_P_POWER_STATE_OCCUPANCY.CORES_C3")
+        c6 = Event("UNC_P_POWER_STATE_OCCUPANCY.CORES_C6")
+    except:
+        return None
+
+    num_cores = Literal("#num_cores") / Literal("#num_packages")
+
+    max_cycles = pcu_ticks * num_cores
+    total_cycles = c0 + c3 + c6
+
+    # remove fused-off cores which show up in C6/C7.
+    c6 = Select(max(c6 - (total_cycles - max_cycles), 0),
+                total_cycles > max_cycles,
+                c6)
+
+    return MetricGroup("lpm_cstate", [
+        Metric("lpm_cstate_c0", "C-State cores in C0/C1",
+               d_ratio(c0, pcu_ticks), "cores"),
+        Metric("lpm_cstate_c3", "C-State cores in C3",
+               d_ratio(c3, pcu_ticks), "cores"),
+        Metric("lpm_cstate_c6", "C-State cores in C6/C7",
+               d_ratio(c6, pcu_ticks), "cores"),
+    ])
+
+
+def UncoreDir() -> Optional[MetricGroup]:
+    try:
+        m2m_upd = Event("UNC_M2M_DIRECTORY_UPDATE.ANY")
+        m2m_hits = Event("UNC_M2M_DIRECTORY_HIT.DIRTY_I")
+        # Turn the umask into a ANY rather than DIRTY_I filter.
+        m2m_hits.name += "/umask=0xFF,name=UNC_M2M_DIRECTORY_HIT.ANY/"
+        m2m_miss = Event("UNC_M2M_DIRECTORY_MISS.DIRTY_I")
+        # Turn the umask into a ANY rather than DIRTY_I filter.
+        m2m_miss.name += "/umask=0xFF,name=UNC_M2M_DIRECTORY_MISS.ANY/"
+        cha_upd = Event("UNC_CHA_DIR_UPDATE.HA")
+        # Turn the umask into a ANY rather than HA filter.
+        cha_upd.name += "/umask=3,name=UNC_CHA_DIR_UPDATE.ANY/"
+    except:
+        return None
+
+    m2m_total = m2m_hits + m2m_miss
+    upd = m2m_upd + cha_upd  # in cache lines
+    upd_r = upd / interval_sec
+    look_r = m2m_total / interval_sec
+
+    scale = 64 / 1_000_000  # Cache lines to MB
+    return MetricGroup("lpm_dir", [
+        Metric("lpm_dir_lookup_rate", "",
+               d_ratio(m2m_total, interval_sec), "requests/s"),
+        Metric("lpm_dir_lookup_hits", "",
+               d_ratio(m2m_hits, m2m_total), "100%"),
+        Metric("lpm_dir_lookup_misses", "",
+               d_ratio(m2m_miss, m2m_total), "100%"),
+        Metric("lpm_dir_update_requests", "",
+               d_ratio(m2m_upd + cha_upd, interval_sec), "requests/s"),
+        Metric("lpm_dir_update_bw", "",
+               d_ratio(m2m_upd + cha_upd, interval_sec), f"{scale}MB/s"),
+    ])
+
+
+def UncoreMem() -> Optional[MetricGroup]:
+    try:
+        loc_rds = Event("UNC_CHA_REQUESTS.READS_LOCAL",
+                        "UNC_H_REQUESTS.READS_LOCAL")
+        rem_rds = Event("UNC_CHA_REQUESTS.READS_REMOTE",
+                        "UNC_H_REQUESTS.READS_REMOTE")
+        loc_wrs = Event("UNC_CHA_REQUESTS.WRITES_LOCAL",
+                        "UNC_H_REQUESTS.WRITES_LOCAL")
+        rem_wrs = Event("UNC_CHA_REQUESTS.WRITES_REMOTE",
+                        "UNC_H_REQUESTS.WRITES_REMOTE")
+    except:
+        return None
+
+    scale = 64 / 1_000_000
+    return MetricGroup("lpm_mem", [
+        MetricGroup("lpm_mem_local", [
+            Metric("lpm_mem_local_read", "Local memory read bandwidth not including directory updates",
+                   d_ratio(loc_rds, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_local_write", "Local memory write bandwidth not including directory updates",
+                   d_ratio(loc_wrs, interval_sec), f"{scale}MB/s"),
+        ]),
+        MetricGroup("lpm_mem_remote", [
+            Metric("lpm_mem_remote_read", "Remote memory read bandwidth not including directory updates",
+                   d_ratio(rem_rds, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_remote_write", "Remote memory write bandwidth not including directory updates",
+                   d_ratio(rem_wrs, interval_sec), f"{scale}MB/s"),
+        ]),
+    ], description="Memory Bandwidth breakdown local vs. remote (remote requests in). directory updates not included")
+
+
+def UncoreMemBw() -> Optional[MetricGroup]:
+    mem_events = []
+    try:
+        mem_events = json.load(open(f"{os.path.dirname(os.path.realpath(__file__))}"
+                                    f"/arch/x86/{args.model}/uncore-memory.json"))
+    except:
+        pass
+
+    ddr_rds = 0
+    ddr_wrs = 0
+    ddr_total = 0
+    for x in mem_events:
+        if "EventName" in x:
+            name = x["EventName"]
+            if re.search("^UNC_MC[0-9]+_RDCAS_COUNT_FREERUN", name):
+                ddr_rds += Event(name)
+            elif re.search("^UNC_MC[0-9]+_WRCAS_COUNT_FREERUN", name):
+                ddr_wrs += Event(name)
+            # elif re.search("^UNC_MC[0-9]+_TOTAL_REQCOUNT_FREERUN", name):
+            #  ddr_total += Event(name)
+
+    if ddr_rds == 0:
+        try:
+            ddr_rds = Event("UNC_M_CAS_COUNT.RD")
+            ddr_wrs = Event("UNC_M_CAS_COUNT.WR")
+        except:
+            return None
+
+    ddr_total = ddr_rds + ddr_wrs
+
+    pmm_rds = 0
+    pmm_wrs = 0
+    try:
+        pmm_rds = Event("UNC_M_PMM_RPQ_INSERTS")
+        pmm_wrs = Event("UNC_M_PMM_WPQ_INSERTS")
+    except:
+        pass
+
+    pmm_total = pmm_rds + pmm_wrs
+
+    scale = 64 / 1_000_000
+    return MetricGroup("lpm_mem_bw", [
+        MetricGroup("lpm_mem_bw_ddr", [
+            Metric("lpm_mem_bw_ddr_read", "DDR memory read bandwidth",
+                   d_ratio(ddr_rds, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_bw_ddr_write", "DDR memory write bandwidth",
+                   d_ratio(ddr_wrs, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_bw_ddr_total", "DDR memory write bandwidth",
+                   d_ratio(ddr_total, interval_sec), f"{scale}MB/s"),
+        ], description="DDR Memory Bandwidth"),
+        MetricGroup("lpm_mem_bw_pmm", [
+            Metric("lpm_mem_bw_pmm_read", "PMM memory read bandwidth",
+                   d_ratio(pmm_rds, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_bw_pmm_write", "PMM memory write bandwidth",
+                   d_ratio(pmm_wrs, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_bw_pmm_total", "PMM memory write bandwidth",
+                   d_ratio(pmm_total, interval_sec), f"{scale}MB/s"),
+        ], description="PMM Memory Bandwidth") if pmm_rds != 0 else None,
+    ], description="Memory Bandwidth")
+
+
+def UncoreMemSat() -> Optional[Metric]:
+    try:
+        clocks = Event("UNC_CHA_CLOCKTICKS", "UNC_C_CLOCKTICKS")
+        sat = Event("UNC_CHA_DISTRESS_ASSERTED.VERT", "UNC_CHA_FAST_ASSERTED.VERT",
+                    "UNC_C_FAST_ASSERTED")
+    except:
+        return None
+
+    desc = ("Mesh Bandwidth saturation (% CBOX cycles with FAST signal asserted, "
+            "include QPI bandwidth saturation), lower is better")
+    if "UNC_CHA_" in sat.name:
+        desc = ("Mesh Bandwidth saturation (% CHA cycles with FAST signal asserted, "
+                "include UPI bandwidth saturation), lower is better")
+    return Metric("lpm_mem_sat", desc, d_ratio(sat, clocks), "100%")
+
+
+def UncoreUpiBw() -> Optional[MetricGroup]:
+    try:
+        upi_rds = Event("UNC_UPI_RxL_FLITS.ALL_DATA")
+        upi_wrs = Event("UNC_UPI_TxL_FLITS.ALL_DATA")
+    except:
+        return None
+
+    upi_total = upi_rds + upi_wrs
+
+    # From "Uncore Performance Monitoring": When measuring the amount of
+    # bandwidth consumed by transmission of the data (i.e. NOT including
+    # the header), it should be .ALL_DATA / 9 * 64B.
+    scale = (64 / 9) / 1_000_000
+    return MetricGroup("lpm_upi_bw", [
+        Metric("lpm_upi_bw_read", "UPI read bandwidth",
+               d_ratio(upi_rds, interval_sec), f"{scale}MB/s"),
+        Metric("lpm_upi_bw_write", "DDR memory write bandwidth",
+               d_ratio(upi_wrs, interval_sec), f"{scale}MB/s"),
+    ], description="UPI Bandwidth")
+
+
+def main() -> None:
+    global _args
+
+    def dir_path(path: str) -> str:
+        """Validate path is a directory for argparse."""
+        if os.path.isdir(path):
+            return path
+        raise argparse.ArgumentTypeError(
+            f'\'{path}\' is not a valid directory')
+
+    parser = argparse.ArgumentParser(description="Intel perf json generator")
+    parser.add_argument(
+        "-metricgroups", help="Generate metricgroups data", action='store_true')
+    parser.add_argument("model", help="e.g. skylakex")
+    parser.add_argument(
+        'events_path',
+        type=dir_path,
+        help='Root of tree containing architecture directories containing json files'
+    )
+    _args = parser.parse_args()
+
+    directory = f"{_args.events_path}/x86/{_args.model}/"
+    LoadEvents(directory)
+
+    all_metrics = MetricGroup("", [
+        Cycles(),
+        Idle(),
+        Rapl(),
+        Smi(),
+        Tsx(),
+        IntelBr(),
+        IntelCtxSw(),
+        IntelFpu(),
+        IntelIlp(),
+        IntelL2(),
+        IntelLdSt(),
+        IntelMissLat(),
+        IntelMlp(),
+        IntelPorts(),
+        IntelSwpf(),
+        UncoreCState(),
+        UncoreDir(),
+        UncoreMem(),
+        UncoreMemBw(),
+        UncoreMemSat(),
+        UncoreUpiBw(),
+    ])
+
+    if _args.metricgroups:
+        print(JsonEncodeMetricGroupDescriptions(all_metrics))
+    else:
+        print(JsonEncodeMetric(all_metrics))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py
index dd8fd06940e6..585454828c2f 100644
--- a/tools/perf/pmu-events/metric.py
+++ b/tools/perf/pmu-events/metric.py
@@ -3,10 +3,115 @@
 import ast
 import decimal
 import json
+import os
 import re
 from enum import Enum
 from typing import Dict, List, Optional, Set, Tuple, Union
 
+all_pmus = set()
+all_events = set()
+experimental_events = set()
+all_events_all_models = set()
+
+def LoadEvents(directory: str) -> None:
+  """Populate a global set of all known events for the purpose of validating Event names"""
+  global all_pmus
+  global all_events
+  global experimental_events
+  global all_events_all_models
+  all_events = {
+      "context\\-switches",
+      "cpu\\-cycles",
+      "cycles",
+      "duration_time",
+      "instructions",
+      "l2_itlb_misses",
+  }
+  for file in os.listdir(os.fsencode(directory)):
+    filename = os.fsdecode(file)
+    if filename.endswith(".json"):
+      try:
+        for x in json.load(open(f"{directory}/{filename}")):
+          if "Unit" in x:
+            all_pmus.add(x["Unit"])
+          if "EventName" in x:
+            all_events.add(x["EventName"])
+            if "Experimental" in x and x["Experimental"] == "1":
+              experimental_events.add(x["EventName"])
+          elif "ArchStdEvent" in x:
+            all_events.add(x["ArchStdEvent"])
+      except json.decoder.JSONDecodeError:
+        # The generated directory may be the same as the input, which
+        # causes partial json files. Ignore errors.
+        pass
+  all_events_all_models = all_events.copy()
+  for root, dirs, files in os.walk(directory + ".."):
+    for filename in files:
+      if filename.endswith(".json"):
+        try:
+          for x in json.load(open(f"{root}/{filename}")):
+            if "EventName" in x:
+              all_events_all_models.add(x["EventName"])
+            elif "ArchStdEvent" in x:
+              all_events_all_models.add(x["ArchStdEvent"])
+        except json.decoder.JSONDecodeError:
+          # The generated directory may be the same as the input, which
+          # causes partial json files. Ignore errors.
+          pass
+
+
+def CheckPmu(name: str) -> bool:
+  return name in all_pmus
+
+
+def CheckEvent(name: str) -> bool:
+  """Check the event name exists in the set of all loaded events"""
+  global all_events
+  if len(all_events) == 0:
+    # No events loaded so assume any event is good.
+    return True
+
+  if ':' in name:
+    # Remove trailing modifier.
+    name = name[:name.find(':')]
+  elif '/' in name:
+    # Name could begin with a PMU or an event, for now assume it is good.
+    return True
+
+  return name in all_events
+
+def CheckEveryEvent(*names: str) -> None:
+  """Check all the events exist in at least one json file"""
+  global all_events_all_models
+  if len(all_events_all_models) == 0:
+    assert len(names) == 1, f"Cannot determine valid events in {names}"
+    # No events loaded so assume any event is good.
+    return
+
+  for name in names:
+    # Remove trailing modifier.
+    if ':' in name:
+      name = name[:name.find(':')]
+    elif '/' in name:
+      name = name[:name.find('/')]
+      if any([name.startswith(x) for x in ['amd', 'arm', 'cpu', 'msr', 'power']]):
+        continue
+    if name not in all_events_all_models:
+      raise Exception(f"Is {name} a named json event?")
+
+
+def IsExperimentalEvent(name: str) -> bool:
+  global experimental_events
+  if ':' in name:
+    # Remove trailing modifier.
+    name = name[:name.find(':')]
+  elif '/' in name:
+    # Name could begin with a PMU or an event, for now assume it is not experimental.
+    return False
+
+  return name in experimental_events
+
+
 class MetricConstraint(Enum):
   GROUPED_EVENTS = 0
   NO_GROUP_EVENTS = 1
@@ -28,6 +133,10 @@ class Expression:
     """Returns a simplified version of self."""
     raise NotImplementedError()
 
+  def HasExperimentalEvents(self) -> bool:
+    """Are experimental events used in the expression?"""
+    raise NotImplementedError()
+
   def Equals(self, other) -> bool:
     """Returns true when two expressions are the same."""
     raise NotImplementedError()
@@ -195,6 +304,9 @@ class Operator(Expression):
 
     return Operator(self.operator, lhs, rhs)
 
+  def HasExperimentalEvents(self) -> bool:
+    return self.lhs.HasExperimentalEvents() or self.rhs.HasExperimentalEvents()
+
   def Equals(self, other: Expression) -> bool:
     if isinstance(other, Operator):
       return self.operator == other.operator and self.lhs.Equals(
@@ -243,6 +355,10 @@ class Select(Expression):
 
     return Select(true_val, cond, false_val)
 
+  def HasExperimentalEvents(self) -> bool:
+    return (self.cond.HasExperimentalEvents() or self.true_val.HasExperimentalEvents() or
+            self.false_val.HasExperimentalEvents())
+
   def Equals(self, other: Expression) -> bool:
     if isinstance(other, Select):
       return self.cond.Equals(other.cond) and self.false_val.Equals(
@@ -291,6 +407,9 @@ class Function(Expression):
 
     return Function(self.fn, lhs, rhs)
 
+  def HasExperimentalEvents(self) -> bool:
+    return self.lhs.HasExperimentalEvents() or (self.rhs and self.rhs.HasExperimentalEvents())
+
   def Equals(self, other: Expression) -> bool:
     if isinstance(other, Function):
       result = self.fn == other.fn and self.lhs.Equals(other.lhs)
@@ -317,9 +436,22 @@ def _FixEscapes(s: str) -> str:
 class Event(Expression):
   """An event in an expression."""
 
-  def __init__(self, name: str, legacy_name: str = ''):
-    self.name = _FixEscapes(name)
-    self.legacy_name = _FixEscapes(legacy_name)
+  def __init__(self, *args: str):
+    error = ""
+    CheckEveryEvent(*args)
+    for name in args:
+      if CheckEvent(name):
+        self.name = _FixEscapes(name)
+        return
+      if error:
+        error += " or " + name
+      else:
+        error = name
+    global all_events
+    raise Exception(f"No event {error} in:\n{all_events}")
+
+  def HasExperimentalEvents(self) -> bool:
+    return IsExperimentalEvent(self.name)
 
   def ToPerfJson(self):
     result = re.sub('/', '@', self.name)
@@ -338,6 +470,31 @@ class Event(Expression):
     return self
 
 
+class MetricRef(Expression):
+  """A metric reference in an expression."""
+
+  def __init__(self, name: str):
+    self.name = _FixEscapes(name)
+
+  def ToPerfJson(self):
+    return self.name
+
+  def ToPython(self):
+    return f'MetricRef(r"{self.name}")'
+
+  def Simplify(self) -> Expression:
+    return self
+
+  def HasExperimentalEvents(self) -> bool:
+    return False
+
+  def Equals(self, other: Expression) -> bool:
+    return isinstance(other, MetricRef) and self.name == other.name
+
+  def Substitute(self, name: str, expression: Expression) -> Expression:
+    return self
+
+
 class Constant(Expression):
   """A constant within the expression tree."""
 
@@ -358,6 +515,9 @@ class Constant(Expression):
   def Simplify(self) -> Expression:
     return self
 
+  def HasExperimentalEvents(self) -> bool:
+    return False
+
   def Equals(self, other: Expression) -> bool:
     return isinstance(other, Constant) and self.value == other.value
 
@@ -380,6 +540,9 @@ class Literal(Expression):
   def Simplify(self) -> Expression:
     return self
 
+  def HasExperimentalEvents(self) -> bool:
+    return False
+
   def Equals(self, other: Expression) -> bool:
     return isinstance(other, Literal) and self.value == other.value
 
@@ -442,6 +605,8 @@ class Metric:
     self.name = name
     self.description = description
     self.expr = expr.Simplify()
+    if self.expr.HasExperimentalEvents():
+      self.description += " (metric should be considered experimental as it contains experimental events)."
     # Workraound valid_only_metric hiding certain metrics based on unit.
     scale_unit = scale_unit.replace('/sec', ' per sec')
     if scale_unit[0].isdigit():
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Context.c b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
index 60dcfe56d4d9..c19f44610983 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/Context.c
+++ b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
@@ -93,7 +93,7 @@ static PyObject *perf_sample_insn(PyObject *obj, PyObject *args)
 	if (c->sample->ip && !c->sample->insn_len && thread__maps(c->al->thread)) {
 		struct machine *machine =  maps__machine(thread__maps(c->al->thread));
 
-		script_fetch_insn(c->sample, c->al->thread, machine, /*native_arch=*/true);
+		perf_sample__fetch_insn(c->sample, c->al->thread, machine);
 	}
 	if (!c->sample->insn_len)
 		Py_RETURN_NONE; /* N.B. This is a return statement */
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index bd6ffa8e4578..06507066213b 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -153,6 +153,11 @@ static struct test_workload *workloads[] = {
 	&workload__datasym,
 	&workload__landlock,
 	&workload__traploop,
+	&workload__inlineloop,
+
+#ifdef HAVE_RUST_SUPPORT
+	&workload__code_with_type,
+#endif
 };
 
 #define workloads__for_each(workload) \
diff --git a/tools/perf/tests/dlfilter-test.c b/tools/perf/tests/dlfilter-test.c
index 80a1c941138d..e63790c61d53 100644
--- a/tools/perf/tests/dlfilter-test.c
+++ b/tools/perf/tests/dlfilter-test.c
@@ -30,7 +30,6 @@
 #include "symbol.h"
 #include "synthetic-events.h"
 #include "util.h"
-#include "archinsn.h"
 #include "dlfilter.h"
 #include "tests.h"
 #include "util/sample.h"
diff --git a/tools/perf/tests/kallsyms-split.c b/tools/perf/tests/kallsyms-split.c
index bbbc66957e5d..117ed3b70f63 100644
--- a/tools/perf/tests/kallsyms-split.c
+++ b/tools/perf/tests/kallsyms-split.c
@@ -148,6 +148,7 @@ static int test__kallsyms_split(struct test_suite *test __maybe_unused,
 	ret = TEST_OK;
 
 out:
+	map__put(map);
 	remove_proc_dir(0);
 	machine__exit(&m);
 	return ret;
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index 6641701e4828..6587dc326d1b 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -70,6 +70,7 @@ make_python_perf_so := $(python_perf_so)
 make_debug          := DEBUG=1
 make_nondistro      := BUILD_NONDISTRO=1
 make_extra_tests    := EXTRA_TESTS=1
+make_no_jevents     := NO_JEVENTS=1
 make_jevents_all    := JEVENTS_ARCH=all
 make_no_bpf_skel    := BUILD_BPF_SKEL=0
 make_gen_vmlinux_h  := GEN_VMLINUX_H=1
@@ -83,9 +84,9 @@ make_no_demangle    := NO_DEMANGLE=1
 make_no_libelf      := NO_LIBELF=1
 make_no_libdw       := NO_LIBDW=1
 make_libunwind      := LIBUNWIND=1
-make_no_libdw_dwarf_unwind := NO_LIBDW_DWARF_UNWIND=1
 make_no_backtrace   := NO_BACKTRACE=1
 make_no_libcapstone := NO_CAPSTONE=1
+make_libcapstone_dlopen := LIBCAPSTONE_DLOPEN=1
 make_no_libnuma     := NO_LIBNUMA=1
 make_no_libbionic   := NO_LIBBIONIC=1
 make_no_libbpf	    := NO_LIBBPF=1
@@ -120,9 +121,12 @@ make_static         := LDFLAGS=-static NO_PERF_READ_VDSO32=1 NO_PERF_READ_VDSOX3
 make_minimal        := NO_LIBPYTHON=1 NO_GTK2=1
 make_minimal        += NO_DEMANGLE=1 NO_LIBELF=1 NO_BACKTRACE=1
 make_minimal        += NO_LIBNUMA=1 NO_LIBBIONIC=1 NO_LIBDW=1
-make_minimal        += NO_LIBDW_DWARF_UNWIND=1 NO_LIBBPF=1
+make_minimal        += NO_LIBBPF=1
 make_minimal        += NO_SDT=1 NO_JVMTI=1 NO_LIBZSTD=1
-make_minimal        += NO_LIBCAP=1 NO_CAPSTONE=1
+make_minimal        += NO_CAPSTONE=1
+
+# binutils 2_42 and newer have bfd_thread_init()
+new_libbfd := $(shell echo '#include <bfd.h>' | $(CC) -E -x c - | grep bfd_thread_init)
 
 # $(run) contains all available tests
 run := make_pure
@@ -137,8 +141,11 @@ MAKE_F := $(MAKE) -f $(MK)
 endif
 run += make_python_perf_so
 run += make_debug
+ifneq ($(new_libbfd),)
 run += make_nondistro
+endif
 run += make_extra_tests
+run += make_no_jevents
 run += make_jevents_all
 run += make_no_bpf_skel
 run += make_gen_vmlinux_h
@@ -155,6 +162,7 @@ run += make_libunwind
 run += make_no_libdw_dwarf_unwind
 run += make_no_backtrace
 run += make_no_libcapstone
+run += make_libcapstone_dlopen
 run += make_no_libnuma
 run += make_no_libbionic
 run += make_no_libbpf
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 128d21dc389f..1d3cc224fbc2 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -2609,8 +2609,8 @@ static int test_events(const struct evlist_test *events, int cnt)
 	for (int i = 0; i < cnt; i++) {
 		struct evlist_test e = events[i];
 		int test_ret;
-		const char *pos = e.name;
-		char buf[1024], *buf_pos = buf, *end;
+		const char *pos = e.name, *end;
+		char buf[1024], *buf_pos = buf;
 
 		while ((end = strstr(pos, "default_core"))) {
 			size_t len = end - pos;
@@ -2627,7 +2627,7 @@ static int test_events(const struct evlist_test *events, int cnt)
 		pr_debug("running test %d '%s'\n", i, e.name);
 		test_ret = test_event(&e);
 		if (test_ret != TEST_OK) {
-			pr_debug("Event test failure: test %d '%s'", i, e.name);
+			pr_debug("Event test failure: test %d '%s'\n", i, e.name);
 			ret = combine_test_results(ret, test_ret);
 		}
 	}
@@ -2764,7 +2764,7 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest
 
 			test_ret = test_event(&e);
 			if (test_ret != TEST_OK) {
-				pr_debug("Test PMU event failed for '%s'", name);
+				pr_debug("Test PMU event failed for '%s'\n", name);
 				ret = combine_test_results(ret, test_ret);
 			}
 
@@ -2790,7 +2790,7 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest
 			e.check = test__checkevent_pmu_events_mix;
 			test_ret = test_event(&e);
 			if (test_ret != TEST_OK) {
-				pr_debug("Test PMU event failed for '%s'", name);
+				pr_debug("Test PMU event failed for '%s'\n", name);
 				ret = combine_test_results(ret, test_ret);
 			}
 		}
diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c
index 6bbc209a5c6a..7c7f489a5eb0 100644
--- a/tools/perf/tests/parse-metric.c
+++ b/tools/perf/tests/parse-metric.c
@@ -41,6 +41,8 @@ static void load_runtime_stat(struct evlist *evlist, struct value *vals)
 		count = find_value(evsel->name, vals);
 		evsel->supported = true;
 		evsel->stats->aggr->counts.val = count;
+		evsel->stats->aggr->counts.ena = 1;
+		evsel->stats->aggr->counts.run = 1;
 	}
 }
 
diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c
index cbded2c6faa4..0ebf2d7b2cb4 100644
--- a/tools/perf/tests/pmu.c
+++ b/tools/perf/tests/pmu.c
@@ -192,12 +192,102 @@ static int test__pmu_format(struct test_suite *test __maybe_unused, int subtest
 	}
 	if (attr.config2 != 0x0400000020041d07) {
 		pr_err("Unexpected config2 value %llx\n", attr.config2);
+	}
+
+	ret = TEST_OK;
+err_out:
+	parse_events_terms__exit(&terms);
+	test_pmu_put(dir, pmu);
+	return ret;
+}
+
+static int test__pmu_usr_chgs(struct test_suite *test __maybe_unused, int subtest __maybe_unused)
+{
+	const char *event = "perf-pmu-test/config=15,config1=4,krava02=170,"
+			    "krava03=1,krava11=27,krava12=1/";
+	struct parse_events_terms terms;
+	struct parse_events_error err;
+	LIST_HEAD(config_terms);
+	struct evlist *evlist;
+	struct perf_pmu *pmu;
+	struct evsel *evsel;
+	int ret = TEST_FAIL;
+	char dir[PATH_MAX];
+	u64 val;
+
+	pmu = test_pmu_get(dir, sizeof(dir));
+	if (!pmu)
+		return TEST_FAIL;
+
+	evlist = evlist__new();
+	if (evlist == NULL) {
+		pr_err("Failed allocation");
+		goto err_out;
+	}
+
+	parse_events_terms__init(&terms);
+	ret = parse_events(evlist, event, &err);
+	if (ret) {
+		pr_debug("failed to parse event '%s', err %d\n", event, ret);
+		parse_events_error__print(&err, event);
+		if (parse_events_error__contains(&err, "can't access trace events"))
+			ret = TEST_SKIP;
 		goto err_out;
 	}
+	evsel = evlist__first(evlist);
+
+	/*
+	 * Set via config=15, krava01 bits 0-1
+	 * Set via config1=4, krava11 bit 1
+	 * Set values: krava02=170, krava03=1, krava11=27, krava12=1
+	 *
+	 * Test that already set values aren't overwritten.
+	 */
+	evsel__set_config_if_unset(evsel, "krava01", 16);
+	evsel__get_config_val(evsel, "krava01", &val);
+	TEST_ASSERT_EQUAL("krava01 overwritten", (int) val, (15 & 0b11));
+
+	evsel__set_config_if_unset(evsel, "krava11", 45);
+	evsel__get_config_val(evsel, "krava11", &val);
+	TEST_ASSERT_EQUAL("krava11 overwritten", (int) val, (27 | (4 << 1)));
+
+	evsel__set_config_if_unset(evsel, "krava02", 32);
+	evsel__get_config_val(evsel, "krava02", &val);
+	TEST_ASSERT_EQUAL("krava02 overwritten", (int) val, 170);
+
+	evsel__set_config_if_unset(evsel, "krava03", 0);
+	evsel__get_config_val(evsel, "krava03", &val);
+	TEST_ASSERT_EQUAL("krava03 overwritten", (int) val, 1);
+
+	/*
+	 * krava13 doesn't have any bits set by either krava13= or config1=
+	 * but setting _any_ raw value for config1 implies that krava13
+	 * shouldn't be overwritten. So it's value should remain as 0.
+	 */
+	evsel__set_config_if_unset(evsel, "krava13", 5);
+	evsel__get_config_val(evsel, "krava13", &val);
+	TEST_ASSERT_EQUAL("krava13 overwritten", (int) val, 0);
+
+	/*
+	 * Unset values: krava21, krava22, krava23
+	 *
+	 * Test that unset values are overwritten.
+	 */
+	evsel__set_config_if_unset(evsel, "krava21", 13905);
+	evsel__get_config_val(evsel, "krava21", &val);
+	TEST_ASSERT_EQUAL("krava21 not overwritten", (int) val, 13905);
+
+	evsel__set_config_if_unset(evsel, "krava22", 11);
+	evsel__get_config_val(evsel, "krava22", &val);
+	TEST_ASSERT_EQUAL("krava22 not overwritten", (int) val, 11);
 
+	evsel__set_config_if_unset(evsel, "krava23", 0);
+	evsel__get_config_val(evsel, "krava23", &val);
+	TEST_ASSERT_EQUAL("krava23 not overwritten", (int) val, 0);
 	ret = TEST_OK;
 err_out:
 	parse_events_terms__exit(&terms);
+	evlist__delete(evlist);
 	test_pmu_put(dir, pmu);
 	return ret;
 }
@@ -539,6 +629,7 @@ static struct test_case tests__pmu[] = {
 	TEST_CASE("PMU name combining", name_len),
 	TEST_CASE("PMU name comparison", name_cmp),
 	TEST_CASE("PMU cmdline match", pmu_match),
+	TEST_CASE("PMU user config changes", pmu_usr_chgs),
 	{	.name = NULL, }
 };
 
diff --git a/tools/perf/tests/shell/addr2line_inlines.sh b/tools/perf/tests/shell/addr2line_inlines.sh
new file mode 100755
index 000000000000..e8754ef2d7f2
--- /dev/null
+++ b/tools/perf/tests/shell/addr2line_inlines.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# test addr2line inline unwinding
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+err=0
+test_dir=$(mktemp -d /tmp/perf-test-inline-addr2line.XXXXXXXXXX)
+perf_data="${test_dir}/perf.data"
+perf_script_txt="${test_dir}/perf_script.txt"
+
+cleanup() {
+    rm -rf "${test_dir}"
+    trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+    echo "Unexpected signal in ${FUNCNAME[1]}"
+    cleanup
+    exit 1
+}
+trap trap_cleanup EXIT TERM INT
+
+test_fp() {
+    echo "Inline unwinding fp verification test"
+    # Record data. Currently only dwarf callchains support inlined functions.
+    perf record --call-graph fp -e task-clock:u -o "${perf_data}" -- perf test -w inlineloop 1
+
+    # Check output with inline (default) and srcline
+    perf script -i "${perf_data}" --fields +srcline > "${perf_script_txt}"
+
+    # Expect the leaf and middle functions to occur on lines in the 20s, with
+    # the non-inlined parent function on a line in the 30s.
+    if grep -q "inlineloop.c:2. (inlined)" "${perf_script_txt}" &&
+       grep -q "inlineloop.c:3.$" "${perf_script_txt}"
+    then
+        echo "Inline unwinding fp verification test [Success]"
+    else
+        echo "Inline unwinding fp verification test [Failed missing inlined functions]"
+        err=1
+    fi
+}
+
+test_dwarf() {
+    echo "Inline unwinding dwarf verification test"
+    # Record data. Currently only dwarf callchains support inlined functions.
+    perf record --call-graph dwarf -e task-clock:u -o "${perf_data}" -- perf test -w inlineloop 1
+
+    # Check output with inline (default) and srcline
+    perf script -i "${perf_data}" --fields +srcline > "${perf_script_txt}"
+
+    # Expect the leaf and middle functions to occur on lines in the 20s, with
+    # the non-inlined parent function on a line in the 30s.
+    if grep -q "inlineloop.c:2. (inlined)" "${perf_script_txt}" &&
+       grep -q "inlineloop.c:3.$" "${perf_script_txt}"
+    then
+        echo "Inline unwinding dwarf verification test [Success]"
+    else
+        echo "Inline unwinding dwarf verification test [Failed missing inlined functions]"
+        err=1
+    fi
+}
+
+test_lbr() {
+    echo "Inline unwinding LBR verification test"
+    if [ ! -f /sys/bus/event_source/devices/cpu/caps/branches ] &&
+       [ ! -f /sys/bus/event_source/devices/cpu_core/caps/branches ]
+    then
+        echo "Skip: only x86 CPUs support LBR"
+        return
+    fi
+
+    # Record data. Currently only dwarf callchains support inlined functions.
+    perf record --call-graph lbr -e cycles:u -o "${perf_data}" -- perf test -w inlineloop 1
+
+    # Check output with inline (default) and srcline
+    perf script -i "${perf_data}" --fields +srcline > "${perf_script_txt}"
+
+    # Expect the leaf and middle functions to occur on lines in the 20s, with
+    # the non-inlined parent function on a line in the 30s.
+    if grep -q "inlineloop.c:2. (inlined)" "${perf_script_txt}" &&
+       grep -q "inlineloop.c:3.$" "${perf_script_txt}"
+    then
+        echo "Inline unwinding lbr verification test [Success]"
+    else
+        echo "Inline unwinding lbr verification test [Failed missing inlined functions]"
+        err=1
+    fi
+}
+
+test_fp
+test_dwarf
+test_lbr
+
+cleanup
+exit $err
diff --git a/tools/perf/tests/shell/data_type_profiling.sh b/tools/perf/tests/shell/data_type_profiling.sh
new file mode 100755
index 000000000000..2a7f8f7c42d0
--- /dev/null
+++ b/tools/perf/tests/shell/data_type_profiling.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+# perf data type profiling tests
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+# The logic below follows the same line as the annotate test, but looks for a
+# data type profiling manifestation
+
+# Values in testtypes and testprogs should match
+testtypes=("# data-type: struct Buf" "# data-type: struct _buf")
+testprogs=("perf test -w code_with_type" "perf test -w datasym")
+
+err=0
+perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
+perfout=$(mktemp /tmp/__perf_test.perf.out.XXXXX)
+
+cleanup() {
+  rm -rf "${perfdata}" "${perfout}"
+  rm -rf "${perfdata}".old
+
+  trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+  echo "Unexpected signal in ${FUNCNAME[1]}"
+  cleanup
+  exit 1
+}
+trap trap_cleanup EXIT TERM INT
+
+test_basic_annotate() {
+  mode=$1
+  runtime=$2
+
+  echo "${mode} ${runtime} perf annotate test"
+
+  case "x${runtime}" in
+    "xRust")
+    if ! perf check feature -q rust
+    then
+      echo "Skip: code_with_type workload not built in 'perf test'"
+      return
+    fi
+    index=0 ;;
+
+    "xC")
+    index=1 ;;
+  esac
+
+  if [ "x${mode}" == "xBasic" ]
+  then
+    perf mem record -o "${perfdata}" ${testprogs[$index]} 2> /dev/null
+  else
+    perf mem record -o - ${testprogs[$index]} 2> /dev/null > "${perfdata}"
+  fi
+  if [ "x$?" != "x0" ]
+  then
+    echo "${mode} annotate [Failed: perf record]"
+    err=1
+    return
+  fi
+
+  # Generate the annotated output file
+  if [ "x${mode}" == "xBasic" ]
+  then
+    perf annotate --code-with-type -i "${perfdata}" --stdio --percent-limit 1 2> /dev/null > "${perfout}"
+  else
+    perf annotate --code-with-type -i - --stdio 2> /dev/null --percent-limit 1 < "${perfdata}" > "${perfout}"
+  fi
+
+  # check if it has the target data type
+  if ! grep -q "${testtypes[$index]}" "${perfout}"
+  then
+    echo "${mode} annotate [Failed: missing target data type]"
+    cat "${perfout}"
+    err=1
+    return
+  fi
+  echo "${mode} annotate test [Success]"
+}
+
+test_basic_annotate Basic Rust
+test_basic_annotate Pipe Rust
+test_basic_annotate Basic C
+test_basic_annotate Pipe C
+
+cleanup
+exit $err
diff --git a/tools/perf/tests/shell/evlist.sh b/tools/perf/tests/shell/evlist.sh
index 140f099e75c1..8a22f4171c07 100755
--- a/tools/perf/tests/shell/evlist.sh
+++ b/tools/perf/tests/shell/evlist.sh
@@ -21,13 +21,13 @@ trap trap_cleanup EXIT TERM INT
 
 test_evlist_simple() {
 	echo "Simple evlist test"
-	if ! perf record -e cycles -o "${perfdata}" true 2> /dev/null
+	if ! perf record -e cpu-clock -o "${perfdata}" true 2> /dev/null
 	then
 		echo "Simple evlist [Failed record]"
 		err=1
 		return
 	fi
-	if ! perf evlist -i "${perfdata}" | grep -q "cycles"
+	if ! perf evlist -i "${perfdata}" | grep -q "cpu-clock"
 	then
 		echo "Simple evlist [Failed to list event]"
 		err=1
@@ -38,13 +38,14 @@ test_evlist_simple() {
 
 test_evlist_group() {
 	echo "Group evlist test"
-	if ! perf record -e "{cycles,instructions}" -o "${perfdata}" true 2> /dev/null
+	if ! perf record -e "{cpu-clock,task-clock}" -o "${perfdata}" \
+		-- perf test -w noploop 2> /dev/null
 	then
 		echo "Group evlist [Skipped event group recording failed]"
 		return
 	fi
 
-	if ! perf evlist -i "${perfdata}" -g | grep -q "{.*cycles.*,.*instructions.*}"
+	if ! perf evlist -i "${perfdata}" -g | grep -q "{.*cpu-clock.*,.*task-clock.*}"
 	then
 		echo "Group evlist [Failed to list event group]"
 		err=1
diff --git a/tools/perf/tests/shell/inject-callchain.sh b/tools/perf/tests/shell/inject-callchain.sh
new file mode 100755
index 000000000000..a1cba8010f95
--- /dev/null
+++ b/tools/perf/tests/shell/inject-callchain.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# perf inject to convert DWARF callchains to regular ones
+# SPDX-License-Identifier: GPL-2.0
+
+if ! perf check feature -q dwarf; then
+    echo "SKIP: DWARF support is not available"
+    exit 2
+fi
+
+TESTDATA=$(mktemp /tmp/perf-test.XXXXXX)
+
+err=0
+
+cleanup()
+{
+    trap - EXIT TERM INT
+    rm -f ${TESTDATA}*
+}
+
+trap_cleanup()
+{
+	cleanup
+	exit 1
+}
+
+trap trap_cleanup EXIT TERM INT
+
+echo "recording data with DWARF callchain"
+perf record -F 999 --call-graph dwarf -o "${TESTDATA}" -- perf test -w noploop
+
+echo "convert DWARF callchain using perf inject"
+perf inject -i "${TESTDATA}" --convert-callchain -o "${TESTDATA}.new"
+
+perf report -i "${TESTDATA}" --no-children -q --percent-limit=1 > ${TESTDATA}.out
+perf report -i "${TESTDATA}.new" --no-children -q --percent-limit=1 > ${TESTDATA}.new.out
+
+echo "compare the both result excluding inlined functions"
+if diff -u "${TESTDATA}.out" "${TESTDATA}.new.out" | grep "^- " | grep -qv "(inlined)"; then
+    echo "Found some differences"
+    diff -u "${TESTDATA}.out" "${TESTDATA}.new.out"
+    err=1
+fi
+
+cleanup
+exit $err
diff --git a/tools/perf/tests/shell/kvm.sh b/tools/perf/tests/shell/kvm.sh
index 2a399b83fe80..f88e859025c4 100755
--- a/tools/perf/tests/shell/kvm.sh
+++ b/tools/perf/tests/shell/kvm.sh
@@ -7,9 +7,10 @@ set -e
 err=0
 perfdata=$(mktemp /tmp/__perf_kvm_test.perf.data.XXXXX)
 qemu_pid_file=$(mktemp /tmp/__perf_kvm_test.qemu.pid.XXXXX)
+log_file=$(mktemp /tmp/__perf_kvm_test.live_log.XXXXX)
 
 cleanup() {
-	rm -f "${perfdata}"
+	rm -f "${perfdata}" "${log_file}"
 	if [ -f "${qemu_pid_file}" ]; then
 		if [ -s "${qemu_pid_file}" ]; then
 			qemu_pid=$(cat "${qemu_pid_file}")
@@ -96,6 +97,32 @@ test_kvm_buildid_list() {
 	echo "perf kvm buildid-list test [Success]"
 }
 
+test_kvm_stat_live() {
+	echo "Testing perf kvm stat live"
+
+        # Run perf kvm live for 5 seconds, monitoring that PID
+	# Use sleep to keep stdin open but silent, preventing EOF loop or interactive spam
+	if ! sleep 10 | timeout 5s perf kvm stat live -p "${qemu_pid}" > "${log_file}" 2>&1; then
+		retval=$?
+		if [ $retval -ne 124 ] && [ $retval -ne 0 ]; then
+			echo "perf kvm stat live [Failed: perf kvm stat live failed to start or run (ret=$retval)]"
+			head -n 50 "${log_file}"
+			err=1
+			return
+		fi
+	fi
+
+	# Check for some sample data (percentage)
+	if ! grep -E -q "[0-9]+\.[0-9]+%" "${log_file}"; then
+		echo "perf kvm stat live [Failed: no sample percentage found]"
+		head -n 50 "${log_file}"
+		err=1
+		return
+	fi
+
+	echo "perf kvm stat live test [Success]"
+}
+
 setup_qemu() {
 	# Find qemu
 	if [ "$(uname -m)" = "x86_64" ]; then
@@ -148,6 +175,7 @@ if [ $err -eq 0 ]; then
 	test_kvm_stat
 	test_kvm_record_report
 	test_kvm_buildid_list
+	test_kvm_stat_live
 fi
 
 cleanup
diff --git a/tools/perf/tests/shell/perf_sched_stats.sh b/tools/perf/tests/shell/perf_sched_stats.sh
new file mode 100755
index 000000000000..2b1410b050d0
--- /dev/null
+++ b/tools/perf/tests/shell/perf_sched_stats.sh
@@ -0,0 +1,64 @@
+#!/bin/sh
+# perf sched stats tests
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+err=0
+test_perf_sched_stats_record() {
+  echo "Basic perf sched stats record test"
+  if ! perf sched stats record true 2>&1 | \
+    grep -E -q "[ perf sched stats: Wrote samples to perf.data ]"
+  then
+    echo "Basic perf sched stats record test [Failed]"
+    err=1
+    return
+  fi
+  echo "Basic perf sched stats record test [Success]"
+}
+
+test_perf_sched_stats_report() {
+  echo "Basic perf sched stats report test"
+  perf sched stats record true > /dev/null
+  if ! perf sched stats report 2>&1 | grep -E -q "Description"
+  then
+    echo "Basic perf sched stats report test [Failed]"
+    err=1
+    rm perf.data
+    return
+  fi
+  rm perf.data
+  echo "Basic perf sched stats report test [Success]"
+}
+
+test_perf_sched_stats_live() {
+  echo "Basic perf sched stats live mode test"
+  if ! perf sched stats true 2>&1 | grep -E -q "Description"
+  then
+    echo "Basic perf sched stats live mode test [Failed]"
+    err=1
+    return
+  fi
+  echo "Basic perf sched stats live mode test [Success]"
+}
+
+test_perf_sched_stats_diff() {
+  echo "Basic perf sched stats diff test"
+  perf sched stats record true > /dev/null
+  perf sched stats record true > /dev/null
+  if ! perf sched stats diff > /dev/null
+  then
+    echo "Basic perf sched stats diff test [Failed]"
+    err=1
+    rm perf.data.old perf.data
+    return
+  fi
+  rm perf.data.old perf.data
+  echo "Basic perf sched stats diff test [Success]"
+}
+
+test_perf_sched_stats_record
+test_perf_sched_stats_report
+test_perf_sched_stats_live
+test_perf_sched_stats_diff
+exit $err
diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh
index 0f5841c479e7..7cb81cf3444a 100755
--- a/tools/perf/tests/shell/record.sh
+++ b/tools/perf/tests/shell/record.sh
@@ -260,7 +260,21 @@ test_uid() {
 
 test_leader_sampling() {
   echo "Basic leader sampling test"
-  if ! perf record -o "${perfdata}" -e "{cycles,cycles}:Su" -- \
+  events="{cycles,cycles}:Su"
+  [ "$(uname -m)" = "s390x" ] && {
+    [ ! -d /sys/devices/cpum_sf ] && {
+      echo "No CPUMF [Skipped record]"
+      return
+    }
+    events="{cpum_sf/SF_CYCLES_BASIC/,cycles}:Su"
+    perf record -o "${perfdata}" -e "$events" -- perf test -w brstack 2> /dev/null
+    # Perf grouping might be unsupported, depends on version.
+    [ "$?" -ne 0 ] && {
+      echo "Grouping not support [Skipped record]"
+      return
+    }
+  }
+  if ! perf record -o "${perfdata}" -e "$events" -- \
     perf test -w brstack 2> /dev/null
   then
     echo "Leader sampling [Failed record]"
diff --git a/tools/perf/tests/shell/sched.sh b/tools/perf/tests/shell/sched.sh
index b9b81eaf856e..b9637069adb1 100755
--- a/tools/perf/tests/shell/sched.sh
+++ b/tools/perf/tests/shell/sched.sh
@@ -53,7 +53,7 @@ start_noploops() {
 }
 
 cleanup_noploops() {
-  kill "$PID1" "$PID2"
+  kill "$PID1" "$PID2" || true
 }
 
 test_sched_record() {
diff --git a/tools/perf/tests/shell/script_dlfilter.sh b/tools/perf/tests/shell/script_dlfilter.sh
index 45c97d4a7d5f..aaed92bb7828 100755
--- a/tools/perf/tests/shell/script_dlfilter.sh
+++ b/tools/perf/tests/shell/script_dlfilter.sh
@@ -68,17 +68,17 @@ test_dlfilter() {
 	fi
 
 	# Build the dlfilter
-	if ! cc -c -I tools/perf/include -fpic -x c "${dlfilter_c}" -o "${dlfilter_so}.o"
+	if ! cc -c -I ${shelldir}/../../include -fpic -x c "${dlfilter_c}" -o "${dlfilter_so}.o"
 	then
-		echo "Basic --dlfilter test [Failed to build dlfilter object]"
-		err=1
+		echo "Basic --dlfilter test [Skip - failed to build dlfilter object]"
+		err=2
 		return
 	fi
 
 	if ! cc -shared -o "${dlfilter_so}" "${dlfilter_so}.o"
 	then
-		echo "Basic --dlfilter test [Failed to link dlfilter shared object]"
-		err=1
+		echo "Basic --dlfilter test [Skip - failed to link dlfilter shared object]"
+		err=2
 		return
 	fi
 
diff --git a/tools/perf/tests/shell/script_perl.sh b/tools/perf/tests/shell/script_perl.sh
new file mode 100755
index 000000000000..b6d65b6fbda1
--- /dev/null
+++ b/tools/perf/tests/shell/script_perl.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# perf script perl tests
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+# set PERF_EXEC_PATH to find scripts in the source directory
+perfdir=$(dirname "$0")/../..
+if [ -e "$perfdir/scripts/perl/Perf-Trace-Util" ]; then
+  export PERF_EXEC_PATH=$perfdir
+fi
+
+
+perfdata=$(mktemp /tmp/__perf_test_script_perl.perf.data.XXXXX)
+generated_script=$(mktemp /tmp/__perf_test_script.XXXXX.pl)
+
+cleanup() {
+  rm -f "${perfdata}"
+  rm -f "${generated_script}"
+  trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+  echo "Unexpected signal in ${FUNCNAME[1]}"
+  cleanup
+  exit 1
+}
+trap trap_cleanup TERM INT
+trap cleanup EXIT
+
+check_perl_support() {
+	if perf check feature -q libperl; then
+		return 0
+	fi
+	echo "perf script perl test [Skipped: no libperl support]"
+	return 2
+}
+
+test_script() {
+	local event_name=$1
+	local expected_output=$2
+	local record_opts=$3
+
+	echo "Testing event: $event_name"
+
+	# Try to record. If this fails, it might be permissions or lack of support.
+	# We return 2 to indicate "skip this event" rather than "fail test".
+	if ! perf record -o "${perfdata}" -e "$event_name" $record_opts -- perf test -w thloop > /dev/null 2>&1; then
+		echo "perf script perl test [Skipped: failed to record $event_name]"
+		return 2
+	fi
+
+	echo "Generating perl script..."
+	if ! perf script -i "${perfdata}" -g "${generated_script}"; then
+		echo "perf script perl test [Failed: script generation for $event_name]"
+		return 1
+	fi
+
+	if [ ! -f "${generated_script}" ]; then
+		echo "perf script perl test [Failed: script not generated for $event_name]"
+		return 1
+	fi
+
+	echo "Executing perl script..."
+	output=$(perf script -i "${perfdata}" -s "${generated_script}" 2>&1)
+
+	if echo "$output" | grep -q "$expected_output"; then
+		echo "perf script perl test [Success: $event_name triggered $expected_output]"
+		return 0
+	else
+		echo "perf script perl test [Failed: $event_name did not trigger $expected_output]"
+		echo "Output was:"
+		echo "$output" | head -n 20
+		return 1
+	fi
+}
+
+check_perl_support || exit 2
+
+# Try tracepoint first
+test_script "sched:sched_switch" "sched::sched_switch" "-c 1" && res=0 || res=$?
+
+if [ $res -eq 0 ]; then
+	exit 0
+elif [ $res -eq 1 ]; then
+	exit 1
+fi
+
+# If tracepoint skipped (res=2), try task-clock
+# For generic events like task-clock, the generated script uses process_event()
+# which dumps data using Data::Dumper. We check for "$VAR1" which is standard Dumper output.
+test_script "task-clock" "\$VAR1" "-c 100" && res=0 || res=$?
+
+if [ $res -eq 0 ]; then
+	exit 0
+elif [ $res -eq 1 ]; then
+	exit 1
+fi
+
+# If both skipped
+echo "perf script perl test [Skipped: Could not record tracepoint or task-clock]"
+exit 2
diff --git a/tools/perf/tests/shell/script_python.sh b/tools/perf/tests/shell/script_python.sh
new file mode 100755
index 000000000000..6bc66074a31f
--- /dev/null
+++ b/tools/perf/tests/shell/script_python.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# perf script python tests
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+# set PERF_EXEC_PATH to find scripts in the source directory
+perfdir=$(dirname "$0")/../..
+if [ -e "$perfdir/scripts/python/Perf-Trace-Util" ]; then
+  export PERF_EXEC_PATH=$perfdir
+fi
+
+
+perfdata=$(mktemp /tmp/__perf_test_script_python.perf.data.XXXXX)
+generated_script=$(mktemp /tmp/__perf_test_script.XXXXX.py)
+
+cleanup() {
+  rm -f "${perfdata}"
+  rm -f "${generated_script}"
+  trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+  echo "Unexpected signal in ${FUNCNAME[1]}"
+  cleanup
+  exit 1
+}
+trap trap_cleanup TERM INT
+trap cleanup EXIT
+
+check_python_support() {
+	if perf check feature -q libpython; then
+		return 0
+	fi
+	echo "perf script python test [Skipped: no libpython support]"
+	return 2
+}
+
+test_script() {
+	local event_name=$1
+	local expected_output=$2
+	local record_opts=$3
+
+	echo "Testing event: $event_name"
+
+	# Try to record. If this fails, it might be permissions or lack of
+	# support. Return 2 to indicate "skip this event" rather than "fail
+	# test".
+	if ! perf record -o "${perfdata}" -e "$event_name" $record_opts -- perf test -w thloop > /dev/null 2>&1; then
+		echo "perf script python test [Skipped: failed to record $event_name]"
+		return 2
+	fi
+
+	echo "Generating python script..."
+	if ! perf script -i "${perfdata}" -g "${generated_script}"; then
+		echo "perf script python test [Failed: script generation for $event_name]"
+		return 1
+	fi
+
+	if [ ! -f "${generated_script}" ]; then
+		echo "perf script python test [Failed: script not generated for $event_name]"
+		return 1
+	fi
+
+	# Perf script -g python doesn't generate process_event for generic
+	# events so append it manually to test that the callback works.
+	if ! grep -q "def process_event" "${generated_script}"; then
+		cat <<EOF >> "${generated_script}"
+
+def process_event(param_dict):
+	print("param_dict: %s" % param_dict)
+EOF
+	fi
+
+	echo "Executing python script..."
+	output=$(perf script -i "${perfdata}" -s "${generated_script}" 2>&1)
+
+	if echo "$output" | grep -q "$expected_output"; then
+		echo "perf script python test [Success: $event_name triggered $expected_output]"
+		return 0
+	else
+		echo "perf script python test [Failed: $event_name did not trigger $expected_output]"
+		echo "Output was:"
+		echo "$output" | head -n 20
+		return 1
+	fi
+}
+
+check_python_support || exit 2
+
+# Try tracepoint first
+test_script "sched:sched_switch" "sched__sched_switch" "-c 1" && res=0 || res=$?
+
+if [ $res -eq 0 ]; then
+	exit 0
+elif [ $res -eq 1 ]; then
+	exit 1
+fi
+
+# If tracepoint skipped (res=2), try task-clock
+# For generic events like task-clock, the generated script uses process_event()
+# which prints the param_dict.
+test_script "task-clock" "param_dict" "-c 100" && res=0 || res=$?
+
+if [ $res -eq 0 ]; then
+	exit 0
+elif [ $res -eq 1 ]; then
+	exit 1
+fi
+
+# If both skipped
+echo "perf script python test [Skipped: Could not record tracepoint or task-clock]"
+exit 2
diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh
index 0b2f0f88ca16..4edb04039036 100755
--- a/tools/perf/tests/shell/stat.sh
+++ b/tools/perf/tests/shell/stat.sh
@@ -5,6 +5,21 @@
 set -e
 
 err=0
+stat_output=$(mktemp /tmp/perf-stat-test-output.XXXXX)
+
+cleanup() {
+  rm -f "${stat_output}"
+  trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+  echo "Unexpected signal in ${FUNCNAME[1]}"
+  cleanup
+  exit 1
+}
+
+trap trap_cleanup EXIT TERM INT
+
 test_default_stat() {
   echo "Basic stat command test"
   if ! perf stat true 2>&1 | grep -E -q "Performance counter stats for 'true':"
@@ -233,7 +248,7 @@ test_hybrid() {
   fi
 
   # Run default Perf stat
-  cycles_events=$(perf stat -a -- sleep 0.1 2>&1 | grep -E "/cpu-cycles/[uH]*|  cpu-cycles[:uH]*  " -c)
+  cycles_events=$(perf stat -a -- sleep 0.1 2>&1 | grep -E "/cpu-cycles/[uH]*|  cpu-cycles[:uH]*  "  | wc -l)
 
   # The expectation is that default output will have a cycles events on each
   # hybrid PMU. In situations with no cycles PMU events, like virtualized, this
@@ -248,6 +263,226 @@ test_hybrid() {
   echo "hybrid test [Success]"
 }
 
+test_stat_cpu() {
+  echo "stat -C <cpu> test"
+  # Test the full online CPU list (ranges and lists)
+  online_cpus=$(cat /sys/devices/system/cpu/online)
+  if ! perf stat -C "$online_cpus" -a true > "${stat_output}" 2>&1
+  then
+    echo "stat -C <cpu> test [Failed - command failed for cpus $online_cpus]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "Performance counter stats for" "${stat_output}"
+  then
+    echo "stat -C <cpu> test [Failed - missing output for cpus $online_cpus]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  # Test each individual online CPU
+  for cpu_dir in /sys/devices/system/cpu/cpu[0-9]*; do
+    cpu=${cpu_dir##*/cpu}
+    # Check if online
+    if [ -f "$cpu_dir/online" ] && [ "$(cat "$cpu_dir/online")" -eq 0 ]
+    then
+      continue
+    fi
+
+    if ! perf stat -C "$cpu" -a true > "${stat_output}" 2>&1
+    then
+      echo "stat -C <cpu> test [Failed - command failed for cpu $cpu]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+    if ! grep -E -q "Performance counter stats for" "${stat_output}"
+    then
+      echo "stat -C <cpu> test [Failed - missing output for cpu $cpu]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+  done
+
+  # Test synthetic list and range if cpu0 and cpu1 are online
+  c0_online=0
+  c1_online=0
+  if [ -d "/sys/devices/system/cpu/cpu0" ]
+  then
+    if [ ! -f "/sys/devices/system/cpu/cpu0/online" ] || [ "$(cat /sys/devices/system/cpu/cpu0/online)" -eq 1 ]
+    then
+      c0_online=1
+    fi
+  fi
+  if [ -d "/sys/devices/system/cpu/cpu1" ]
+  then
+    if [ ! -f "/sys/devices/system/cpu/cpu1/online" ] || [ "$(cat /sys/devices/system/cpu/cpu1/online)" -eq 1 ]
+    then
+      c1_online=1
+    fi
+  fi
+
+  if [ $c0_online -eq 1 ] && [ $c1_online -eq 1 ]
+  then
+    # Test list "0,1"
+    if ! perf stat -C "0,1" -a true > "${stat_output}" 2>&1
+    then
+      echo "stat -C <cpu> test [Failed - command failed for cpus 0,1]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+    if ! grep -E -q "Performance counter stats for" "${stat_output}"
+    then
+      echo "stat -C <cpu> test [Failed - missing output for cpus 0,1]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+
+    # Test range "0-1"
+    if ! perf stat -C "0-1" -a true > "${stat_output}" 2>&1
+    then
+      echo "stat -C <cpu> test [Failed - command failed for cpus 0-1]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+    if ! grep -E -q "Performance counter stats for" "${stat_output}"
+    then
+      echo "stat -C <cpu> test [Failed - missing output for cpus 0-1]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+  fi
+
+  echo "stat -C <cpu> test [Success]"
+}
+
+test_stat_no_aggr() {
+  echo "stat -A test"
+  if ! perf stat -A -a true > "${stat_output}" 2>&1
+  then
+    echo "stat -A test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "CPU" "${stat_output}"
+  then
+    echo "stat -A test [Failed - missing CPU column]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+  echo "stat -A test [Success]"
+}
+
+test_stat_detailed() {
+  echo "stat -d test"
+  if ! perf stat -d true > "${stat_output}" 2>&1
+  then
+    echo "stat -d test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "Performance counter stats" "${stat_output}"
+  then
+    echo "stat -d test [Failed - missing output]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! perf stat -dd true > "${stat_output}" 2>&1
+  then
+    echo "stat -dd test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "Performance counter stats" "${stat_output}"
+  then
+    echo "stat -dd test [Failed - missing output]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! perf stat -ddd true > "${stat_output}" 2>&1
+  then
+    echo "stat -ddd test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "Performance counter stats" "${stat_output}"
+  then
+    echo "stat -ddd test [Failed - missing output]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  echo "stat -d test [Success]"
+}
+
+test_stat_repeat() {
+  echo "stat -r test"
+  if ! perf stat -r 2 true > "${stat_output}" 2>&1
+  then
+    echo "stat -r test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "\([[:space:]]*\+-.*%[[:space:]]*\)" "${stat_output}"
+  then
+    echo "stat -r test [Failed - missing variance]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+  echo "stat -r test [Success]"
+}
+
+test_stat_pid() {
+  echo "stat -p test"
+  sleep 1 &
+  pid=$!
+  if ! perf stat -p $pid > "${stat_output}" 2>&1
+  then
+    echo "stat -p test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    kill $pid 2>/dev/null || true
+    wait $pid 2>/dev/null || true
+    return
+  fi
+
+  if ! grep -E -q "Performance counter stats" "${stat_output}"
+  then
+    echo "stat -p test [Failed - missing output]"
+    cat "${stat_output}"
+    err=1
+  else
+    echo "stat -p test [Success]"
+  fi
+  kill $pid 2>/dev/null || true
+  wait $pid 2>/dev/null || true
+}
+
 test_default_stat
 test_null_stat
 test_offline_cpu_stat
@@ -258,4 +493,11 @@ test_topdown_groups
 test_topdown_weak_groups
 test_cputype
 test_hybrid
+test_stat_cpu
+test_stat_no_aggr
+test_stat_detailed
+test_stat_repeat
+test_stat_pid
+
+cleanup
 exit $err
diff --git a/tools/perf/tests/shell/stat_all_metricgroups.sh b/tools/perf/tests/shell/stat_all_metricgroups.sh
index 1400880ec01f..81bc7070b5ab 100755
--- a/tools/perf/tests/shell/stat_all_metricgroups.sh
+++ b/tools/perf/tests/shell/stat_all_metricgroups.sh
@@ -12,31 +12,32 @@ if ParanoidAndNotRoot 0
 then
   system_wide_flag=""
 fi
-err=0
+
+err=3
+skip=0
 for m in $(perf list --raw-dump metricgroups)
 do
   echo "Testing $m"
   result=$(perf stat -M "$m" $system_wide_flag sleep 0.01 2>&1)
   result_err=$?
-  if [[ $result_err -gt 0 ]]
+  if [[ $result_err -eq 0 ]]
   then
+    if [[ "$err" -ne 1 ]]
+    then
+      err=0
+    fi
+  else
     if [[ "$result" =~ \
           "Access to performance monitoring and observability operations is limited" ]]
     then
       echo "Permission failure"
       echo $result
-      if [[ $err -eq 0 ]]
-      then
-        err=2 # Skip
-      fi
+      skip=1
     elif [[ "$result" =~ "in per-thread mode, enable system wide" ]]
     then
       echo "Permissions - need system wide mode"
       echo $result
-      if [[ $err -eq 0 ]]
-      then
-        err=2 # Skip
-      fi
+      skip=1
     elif [[ "$m" == @(Default2|Default3|Default4) ]]
     then
       echo "Ignoring failures in $m that may contain unsupported legacy events"
@@ -48,4 +49,9 @@ do
   fi
 done
 
+if [[ "$err" -eq 3 && "$skip" -eq 1 ]]
+then
+  err=2
+fi
+
 exit $err
diff --git a/tools/perf/tests/shell/stat_all_metrics.sh b/tools/perf/tests/shell/stat_all_metrics.sh
index 3dabb39c7cc8..b582d23f28c9 100755
--- a/tools/perf/tests/shell/stat_all_metrics.sh
+++ b/tools/perf/tests/shell/stat_all_metrics.sh
@@ -15,7 +15,8 @@ then
   test_prog="perf test -w noploop"
 fi
 
-err=0
+skip=0
+err=3
 for m in $(perf list --raw-dump metrics); do
   echo "Testing $m"
   result=$(perf stat -M "$m" $system_wide_flag -- $test_prog 2>&1)
@@ -23,6 +24,10 @@ for m in $(perf list --raw-dump metrics); do
   if [[ $result_err -eq 0 && "$result" =~ ${m:0:50} ]]
   then
     # No error result and metric shown.
+    if [[ "$err" -ne 1 ]]
+    then
+      err=0
+    fi
     continue
   fi
   if [[ "$result" =~ "Cannot resolve IDs for" || "$result" =~ "No supported events found" ]]
@@ -44,7 +49,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   elif [[ "$result" =~ "in per-thread mode, enable system wide" ]]
@@ -53,7 +58,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   elif [[ "$result" =~ "<not supported>" ]]
@@ -68,7 +73,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   elif [[ "$result" =~ "<not counted>" ]]
@@ -77,7 +82,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   elif [[ "$result" =~ "FP_ARITH" || "$result" =~ "AMX" ]]
@@ -86,7 +91,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   elif [[ "$result" =~ "PMM" ]]
@@ -95,7 +100,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   fi
@@ -106,6 +111,10 @@ for m in $(perf list --raw-dump metrics); do
   if [[ $result_err -eq 0 && "$result" =~ ${m:0:50} ]]
   then
     # No error result and metric shown.
+    if [[ "$err" -ne 1 ]]
+    then
+      err=0
+    fi
     continue
   fi
   echo "[Failed $m] has non-zero error '$result_err' or not printed in:"
@@ -113,4 +122,10 @@ for m in $(perf list --raw-dump metrics); do
   err=1
 done
 
+# return SKIP only if no success returned
+if [[ "$err" -eq 3 && "$skip" -eq 1 ]]
+then
+  err=2
+fi
+
 exit "$err"
diff --git a/tools/perf/tests/shell/test_arm_coresight.sh b/tools/perf/tests/shell/test_arm_coresight.sh
index 1c750b67d141..bbf89e944e7b 100755
--- a/tools/perf/tests/shell/test_arm_coresight.sh
+++ b/tools/perf/tests/shell/test_arm_coresight.sh
@@ -198,6 +198,58 @@ arm_cs_etm_basic_test() {
 	arm_cs_report "CoreSight basic testing with '$*'" $err
 }
 
+arm_cs_etm_test_cpu_list() {
+	echo "Testing sparse CPU list: $1"
+	perf record -o ${perfdata} -e cs_etm//u -C $1 \
+		-- taskset --cpu-list $1 true > /dev/null 2>&1
+	perf_script_branch_samples true
+	err=$?
+	arm_cs_report "CoreSight sparse CPUs with '$*'" $err
+}
+
+arm_cs_etm_sparse_cpus_test() {
+	# Iterate for every ETM device
+	cpus=()
+	for dev in /sys/bus/event_source/devices/cs_etm/cpu*; do
+		# Canonicalize the path
+		dev=`readlink -f $dev`
+
+		# Find the ETM device belonging to which CPU
+		cpus+=("$(cat $dev/cpu)")
+	done
+
+	mapfile -t cpus < <(printf '%s\n' "${cpus[@]}" | sort -n)
+	total=${#cpus[@]}
+
+	# Need more than 1 to test
+	if [ $total -le 1 ]; then
+		return 0
+	fi
+
+	half=$((total / 2))
+
+	# First half
+	first_half=$(IFS=,; echo "${cpus[*]:0:$half}")
+	arm_cs_etm_test_cpu_list $first_half
+
+	# Second half
+	second_half=$(IFS=,; echo "${cpus[*]:$half}")
+	arm_cs_etm_test_cpu_list $second_half
+
+	# Odd list is the same as halves unless >= 4 CPUs
+	if [ $total -lt 4 ]; then
+		return 0
+	fi
+
+	# Odd indices
+	odd_cpus=()
+	for ((i=1; i<total; i+=2)); do
+		odd_cpus+=("${cpus[$i]}")
+	done
+	odd_list=$(IFS=,; echo "${odd_cpus[*]}")
+	arm_cs_etm_test_cpu_list $odd_list
+}
+
 arm_cs_etm_traverse_path_test
 arm_cs_etm_system_wide_test
 arm_cs_etm_snapshot_test
@@ -211,4 +263,6 @@ arm_cs_etm_basic_test -e cs_etm/timestamp=1/ -a
 arm_cs_etm_basic_test -e cs_etm/timestamp=0/
 arm_cs_etm_basic_test -e cs_etm/timestamp=1/
 
+arm_cs_etm_sparse_cpus_test
+
 exit $glb_err
diff --git a/tools/perf/tests/shell/test_java_symbol.sh b/tools/perf/tests/shell/test_java_symbol.sh
index 499539d1c479..63a2cc9bf13f 100755
--- a/tools/perf/tests/shell/test_java_symbol.sh
+++ b/tools/perf/tests/shell/test_java_symbol.sh
@@ -22,10 +22,13 @@ cleanup_files()
 
 trap cleanup_files exit term int
 
+PERF_DIR=$(dirname "$(which perf)")
 if [ -e "$PWD/tools/perf/libperf-jvmti.so" ]; then
 	LIBJVMTI=$PWD/tools/perf/libperf-jvmti.so
 elif [ -e "$PWD/libperf-jvmti.so" ]; then
 	LIBJVMTI=$PWD/libperf-jvmti.so
+elif [ -e "$PERF_DIR/libperf-jvmti.so" ]; then
+	LIBJVMTI=$PERF_DIR/libperf-jvmti.so
 elif [ -e "$PREFIX/lib64/libperf-jvmti.so" ]; then
 	LIBJVMTI=$PREFIX/lib64/libperf-jvmti.so
 elif [ -e "$PREFIX/lib/libperf-jvmti.so" ]; then
@@ -34,6 +37,7 @@ elif [ -e "/usr/lib/linux-tools-$(uname -a | awk '{ print $3 }' | sed -r 's/-gen
 	LIBJVMTI=/usr/lib/linux-tools-$(uname -a | awk '{ print $3 }' | sed -r 's/-generic//')/libperf-jvmti.so
 else
 	echo "Fail to find libperf-jvmti.so"
+
 	# JVMTI is a build option, skip the test if fail to find lib
 	exit 2
 fi
diff --git a/tools/perf/tests/shell/test_perf_data_converter_ctf.sh b/tools/perf/tests/shell/test_perf_data_converter_ctf.sh
new file mode 100755
index 000000000000..334eebc9945e
--- /dev/null
+++ b/tools/perf/tests/shell/test_perf_data_converter_ctf.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+# 'perf data convert --to-ctf' command test
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+err=0
+
+perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
+ctf_dir=$(mktemp -d /tmp/__perf_test.ctf.XXXXX)
+
+cleanup()
+{
+	rm -f "${perfdata}"
+	rm -rf "${ctf_dir}"
+	trap - exit term int
+}
+
+trap_cleanup()
+{
+	echo "Unexpected signal in ${FUNCNAME[1]}"
+	cleanup
+	exit ${err}
+}
+trap trap_cleanup exit term int
+
+check_babeltrace_support()
+{
+	if ! perf check feature libbabeltrace
+	then
+		echo "perf not linked with libbabeltrace, skipping test"
+		exit 2
+	fi
+}
+
+test_ctf_converter_file()
+{
+	echo "Testing Perf Data Conversion Command to CTF (File input)"
+	# Record some data
+	if ! perf record -o "$perfdata" -F 99 -g -- perf test -w noploop
+	then
+		echo "Failed to record perf data"
+		err=1
+		return
+	fi
+
+	# Cleanup previous ctf dir
+	rm -rf "${ctf_dir}"
+
+	# Convert
+	if ! perf data convert --to-ctf "$ctf_dir" --force -i "$perfdata"
+	then
+		echo "Perf Data Converter Command to CTF (File input) [FAILED]"
+		err=1
+		return
+	fi
+
+	if [ -d "${ctf_dir}" ] && [ "$(ls -A "${ctf_dir}")" ]
+	then
+		echo "Perf Data Converter Command to CTF (File input) [SUCCESS]"
+	else
+		echo "Perf Data Converter Command to CTF (File input) [FAILED]"
+		echo "    Output directory empty or missing"
+		err=1
+	fi
+}
+
+test_ctf_converter_pipe()
+{
+	echo "Testing Perf Data Conversion Command to CTF (Pipe mode)"
+
+	# Cleanup previous ctf dir
+	rm -rf "${ctf_dir}"
+
+	# Record to stdout and pipe to $perfdata file
+	if ! perf record -o - -F 99 -g -- perf test -w noploop > "$perfdata"
+	then
+		echo "Failed to record perf data"
+		err=1
+		return
+	fi
+
+	if ! perf data convert --to-ctf "$ctf_dir" --force -i "$perfdata"
+	then
+		echo "Perf Data Converter Command to CTF (Pipe mode) [FAILED]"
+		err=1
+		return
+	fi
+
+	if [ -d "${ctf_dir}" ] && [ "$(ls -A "${ctf_dir}")" ]
+	then
+		echo "Perf Data Converter Command to CTF (Pipe mode) [SUCCESS]"
+	else
+		echo "Perf Data Converter Command to CTF (Pipe mode) [FAILED]"
+		echo "    Output directory empty or missing"
+		err=1
+	fi
+}
+
+check_babeltrace_support
+test_ctf_converter_file
+test_ctf_converter_pipe
+
+exit ${err}
diff --git a/tools/perf/tests/shell/test_perf_data_converter_json.sh b/tools/perf/tests/shell/test_perf_data_converter_json.sh
index c4f1b59d116f..35d81e39a26c 100755
--- a/tools/perf/tests/shell/test_perf_data_converter_json.sh
+++ b/tools/perf/tests/shell/test_perf_data_converter_json.sh
@@ -15,29 +15,42 @@ result=$(mktemp /tmp/__perf_test.output.json.XXXXX)
 
 cleanup()
 {
-	rm -f "${perfdata}"
+	rm -f "${perfdata}*"
 	rm -f "${result}"
 	trap - exit term int
 }
 
 trap_cleanup()
 {
+	echo "Unexpected signal in ${FUNCNAME[1]}"
 	cleanup
-	exit ${err}
+	exit 1
 }
 trap trap_cleanup exit term int
 
 test_json_converter_command()
 {
-	echo "Testing Perf Data Convertion Command to JSON"
-	perf record -o "$perfdata" -F 99 -g -- perf test -w noploop > /dev/null 2>&1
-	perf data convert --to-json "$result" --force -i "$perfdata" >/dev/null 2>&1
+	echo "Testing Perf Data Conversion Command to JSON"
+	perf record -o "$perfdata" -F 99 -g -- perf test -w noploop
+	perf data convert --to-json "$result" --force -i "$perfdata"
 	if [ "$(cat ${result} | wc -l)" -gt "0" ] ; then
 		echo "Perf Data Converter Command to JSON [SUCCESS]"
 	else
 		echo "Perf Data Converter Command to JSON [FAILED]"
 		err=1
-		exit
+	fi
+}
+
+test_json_converter_pipe()
+{
+	echo "Testing Perf Data Conversion Command to JSON (Pipe mode)"
+	perf record -o - -F 99 -g -- perf test -w noploop > "$perfdata"
+	cat "$perfdata" | perf data convert --to-json "$result" --force -i -
+	if [ "$(cat ${result} | wc -l)" -gt "0" ] ; then
+		echo "Perf Data Converter Command to JSON (Pipe mode) [SUCCESS]"
+	else
+		echo "Perf Data Converter Command to JSON (Pipe mode) [FAILED]"
+		err=1
 	fi
 }
 
@@ -50,16 +63,18 @@ validate_json_format()
 		else
 			echo "The file does not contain valid JSON format [FAILED]"
 			err=1
-			exit
 		fi
 	else
 		echo "File not found [FAILED]"
-		err=2
-		exit
+		err=1
 	fi
 }
 
 test_json_converter_command
 validate_json_format
 
+test_json_converter_pipe
+validate_json_format
+
+cleanup
 exit ${err}
diff --git a/tools/perf/tests/subcmd-help.c b/tools/perf/tests/subcmd-help.c
index 2280b4c0e5e7..9da96a16fd20 100644
--- a/tools/perf/tests/subcmd-help.c
+++ b/tools/perf/tests/subcmd-help.c
@@ -95,10 +95,36 @@ static int test__exclude_cmdnames(struct test_suite *test __maybe_unused,
 	return TEST_OK;
 }
 
+static int test__exclude_cmdnames_no_overlap(struct test_suite *test __maybe_unused,
+					     int subtest __maybe_unused)
+{
+	struct cmdnames cmds1 = {};
+	struct cmdnames cmds2 = {};
+
+	add_cmdname(&cmds1, "read-vdso32", 11);
+	add_cmdname(&cmds2, "archive", 7);
+
+	TEST_ASSERT_VAL("invalid original size", cmds1.cnt == 1);
+	TEST_ASSERT_VAL("invalid original size", cmds2.cnt == 1);
+
+	exclude_cmds(&cmds1, &cmds2);
+
+	TEST_ASSERT_VAL("invalid excluded size", cmds1.cnt == 1);
+	TEST_ASSERT_VAL("invalid excluded size", cmds2.cnt == 1);
+
+	TEST_ASSERT_VAL("cannot find cmd", is_in_cmdlist(&cmds1, "read-vdso32") == 1);
+	TEST_ASSERT_VAL("wrong cmd", is_in_cmdlist(&cmds1, "archive") == 0);
+
+	clean_cmdnames(&cmds1);
+	clean_cmdnames(&cmds2);
+	return TEST_OK;
+}
+
 static struct test_case tests__subcmd_help[] = {
 	TEST_CASE("Load subcmd names", load_cmdnames),
 	TEST_CASE("Uniquify subcmd names", uniq_cmdnames),
 	TEST_CASE("Exclude duplicate subcmd names", exclude_cmdnames),
+	TEST_CASE("Exclude disjoint subcmd names", exclude_cmdnames_no_overlap),
 	{	.name = NULL, }
 };
 
diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c
index 4a2ad7176fa0..b6e46975379c 100644
--- a/tools/perf/tests/sw-clock.c
+++ b/tools/perf/tests/sw-clock.c
@@ -5,6 +5,7 @@
 #include <stdlib.h>
 #include <signal.h>
 #include <sys/mman.h>
+#include <linux/compiler.h>
 #include <linux/string.h>
 
 #include "tests.h"
@@ -28,7 +29,7 @@
 static int __test__sw_clock_freq(enum perf_sw_ids clock_id)
 {
 	int i, err = -1;
-	volatile int tmp = 0;
+	volatile int tmp __maybe_unused = 0;
 	u64 total_periods = 0;
 	int nr_samples = 0;
 	char sbuf[STRERR_BUFSIZE];
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index cb67ddbd0375..f5f1238d1f7f 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -240,6 +240,11 @@ DECLARE_WORKLOAD(brstack);
 DECLARE_WORKLOAD(datasym);
 DECLARE_WORKLOAD(landlock);
 DECLARE_WORKLOAD(traploop);
+DECLARE_WORKLOAD(inlineloop);
+
+#ifdef HAVE_RUST_SUPPORT
+DECLARE_WORKLOAD(code_with_type);
+#endif
 
 extern const char *dso_to_test;
 extern const char *test_objdump_path;
diff --git a/tools/perf/tests/util.c b/tools/perf/tests/util.c
index b273d287e164..bf2c5b133884 100644
--- a/tools/perf/tests/util.c
+++ b/tools/perf/tests/util.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "tests.h"
+#include "util/blake2s.h"
 #include "util/debug.h"
-#include "util/sha1.h"
 
 #include <linux/compiler.h>
 #include <stdlib.h>
@@ -17,45 +17,72 @@ static int test_strreplace(char needle, const char *haystack,
 	return ret == 0;
 }
 
-#define MAX_LEN 512
+/* Maximum data length tested by test_blake2s() */
+#define MAX_DATA_LEN 512
 
-/* Test sha1() for all lengths from 0 to MAX_LEN inclusively. */
-static int test_sha1(void)
+/*
+ * Hash length tested by test_blake2s().  BLAKE2s supports variable-length
+ * hashes.  However, the only user of BLAKE2s in 'perf' uses 20-byte hashes,
+ * matching the length of the ELF build ID field.  So that's the length we test.
+ */
+#define HASH_LEN 20
+
+/* Test the implementation of the BLAKE2s hash algorithm. */
+static int test_blake2s(void)
 {
-	u8 data[MAX_LEN];
-	size_t digests_size = (MAX_LEN + 1) * SHA1_DIGEST_SIZE;
-	u8 *digests;
-	u8 digest_of_digests[SHA1_DIGEST_SIZE];
+	u8 data[MAX_DATA_LEN];
+	u8 hash[HASH_LEN];
+	u8 hash2[HASH_LEN];
+	struct blake2s_ctx main_ctx;
 	/*
-	 * The correctness of this value was verified by running this test with
-	 * sha1() replaced by OpenSSL's SHA1().
+	 * This value was generated by the following Python code:
+	 *
+	 * import hashlib
+	 *
+	 * data = bytes(i % 256 for i in range(513))
+	 * h = hashlib.blake2s(digest_size=20)
+	 * for i in range(513):
+	 *     h.update(hashlib.blake2s(data=data[:i], digest_size=20).digest())
+	 * print(h.hexdigest())
 	 */
-	static const u8 expected_digest_of_digests[SHA1_DIGEST_SIZE] = {
-		0x74, 0xcd, 0x4c, 0xb9, 0xd8, 0xa6, 0xd5, 0x95, 0x22, 0x8b,
-		0x7e, 0xd6, 0x8b, 0x7e, 0x46, 0x95, 0x31, 0x9b, 0xa2, 0x43,
+	static const u8 expected_hash_of_hashes[20] = {
+		0xef, 0x9b, 0x13, 0x98, 0x78, 0x8e, 0x74, 0x59, 0x9c, 0xd5,
+		0x0c, 0xf0, 0x33, 0x97, 0x79, 0x3d, 0x3e, 0xd0, 0x95, 0xa6
 	};
 	size_t i;
 
-	digests = malloc(digests_size);
-	TEST_ASSERT_VAL("failed to allocate digests", digests != NULL);
-
-	/* Generate MAX_LEN bytes of data. */
-	for (i = 0; i < MAX_LEN; i++)
+	/* Generate MAX_DATA_LEN bytes of data. */
+	for (i = 0; i < MAX_DATA_LEN; i++)
 		data[i] = i;
 
-	/* Calculate a SHA-1 for each length 0 through MAX_LEN inclusively. */
-	for (i = 0; i <= MAX_LEN; i++)
-		sha1(data, i, &digests[i * SHA1_DIGEST_SIZE]);
+	blake2s_init(&main_ctx, sizeof(hash));
+	for (i = 0; i <= MAX_DATA_LEN; i++) {
+		struct blake2s_ctx ctx;
+
+		/* Compute the BLAKE2s hash of 'i' data bytes. */
+		blake2s_init(&ctx, HASH_LEN);
+		blake2s_update(&ctx, data, i);
+		blake2s_final(&ctx, hash);
 
-	/* Calculate digest of all digests calculated above. */
-	sha1(digests, digests_size, digest_of_digests);
+		/* Verify that multiple updates produce the same result. */
+		blake2s_init(&ctx, HASH_LEN);
+		blake2s_update(&ctx, data, i / 2);
+		blake2s_update(&ctx, &data[i / 2], i - (i / 2));
+		blake2s_final(&ctx, hash2);
+		TEST_ASSERT_VAL("inconsistent BLAKE2s hashes",
+				memcmp(hash, hash2, HASH_LEN) == 0);
 
-	free(digests);
+		/*
+		 * Pass the hash to another BLAKE2s context, so that we
+		 * incrementally compute the hash of all the hashes.
+		 */
+		blake2s_update(&main_ctx, hash, HASH_LEN);
+	}
 
-	/* Check for the expected result. */
-	TEST_ASSERT_VAL("wrong output from sha1()",
-			memcmp(digest_of_digests, expected_digest_of_digests,
-			       SHA1_DIGEST_SIZE) == 0);
+	/* Verify the hash of all the hashes. */
+	blake2s_final(&main_ctx, hash);
+	TEST_ASSERT_VAL("wrong BLAKE2s hashes",
+			memcmp(hash, expected_hash_of_hashes, HASH_LEN) == 0);
 	return 0;
 }
 
@@ -68,7 +95,7 @@ static int test__util(struct test_suite *t __maybe_unused, int subtest __maybe_u
 	TEST_ASSERT_VAL("replace long", test_strreplace('a', "abcabc", "longlong",
 							"longlongbclonglongbc"));
 
-	return test_sha1();
+	return test_blake2s();
 }
 
 DEFINE_SUITE("util", util);
diff --git a/tools/perf/tests/workloads/Build b/tools/perf/tests/workloads/Build
index fb1012cc4fc3..2ef97f7affce 100644
--- a/tools/perf/tests/workloads/Build
+++ b/tools/perf/tests/workloads/Build
@@ -8,9 +8,16 @@ perf-test-y += brstack.o
 perf-test-y += datasym.o
 perf-test-y += landlock.o
 perf-test-y += traploop.o
+perf-test-y += inlineloop.o
+
+ifeq ($(CONFIG_RUST_SUPPORT),y)
+    perf-test-y += code_with_type.o
+    perf-test-y += code_with_type.a
+endif
 
 CFLAGS_sqrtloop.o         = -g -O0 -fno-inline -U_FORTIFY_SOURCE
 CFLAGS_leafloop.o         = -g -O0 -fno-inline -fno-omit-frame-pointer -U_FORTIFY_SOURCE
 CFLAGS_brstack.o          = -g -O0 -fno-inline -U_FORTIFY_SOURCE
 CFLAGS_datasym.o          = -g -O0 -fno-inline -U_FORTIFY_SOURCE
 CFLAGS_traploop.o         = -g -O0 -fno-inline -U_FORTIFY_SOURCE
+CFLAGS_inlineloop.o       = -g -O2
diff --git a/tools/perf/tests/workloads/code_with_type.c b/tools/perf/tests/workloads/code_with_type.c
new file mode 100644
index 000000000000..65d7be7dac24
--- /dev/null
+++ b/tools/perf/tests/workloads/code_with_type.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <pthread.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/compiler.h>
+#include "../tests.h"
+
+extern void test_rs(uint count);
+
+static volatile sig_atomic_t done;
+
+static void sighandler(int sig __maybe_unused)
+{
+	done = 1;
+}
+
+static int code_with_type(int argc, const char **argv)
+{
+	int sec = 1, num_loops = 100;
+
+	pthread_setname_np(pthread_self(), "perf-code-with-type");
+	if (argc > 0)
+		sec = atoi(argv[0]);
+
+	if (argc > 1)
+		num_loops = atoi(argv[1]);
+
+	signal(SIGINT, sighandler);
+	signal(SIGALRM, sighandler);
+	alarm(sec);
+
+	/*
+	 * Rust doesn't have signal management in the standard library. To
+	 * not deal with any external crates, offload signal handling to the
+	 * outside code.
+	 */
+	while (!done) {
+		test_rs(num_loops);
+		continue;
+	}
+
+	return 0;
+}
+
+DEFINE_WORKLOAD(code_with_type);
diff --git a/tools/perf/tests/workloads/code_with_type.rs b/tools/perf/tests/workloads/code_with_type.rs
new file mode 100644
index 000000000000..3dab39b22dd7
--- /dev/null
+++ b/tools/perf/tests/workloads/code_with_type.rs
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// We're going to look for this structure in the data type profiling report
+#[allow(dead_code)]
+struct Buf {
+    data1: u64,
+    data2: String,
+    data3: u64,
+}
+
+#[no_mangle]
+pub extern "C" fn test_rs(count: u32) {
+    let mut b = Buf {
+        data1: 0,
+        data2: String::from("data"),
+        data3: 0,
+    };
+
+    for _ in 1..count {
+        b.data1 += 1;
+        if b.data1 == 123 {
+            b.data1 += 1;
+        }
+
+        b.data3 += b.data1;
+    }
+}
diff --git a/tools/perf/tests/workloads/inlineloop.c b/tools/perf/tests/workloads/inlineloop.c
new file mode 100644
index 000000000000..bc82dfc7c410
--- /dev/null
+++ b/tools/perf/tests/workloads/inlineloop.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <pthread.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/compiler.h>
+#include "../tests.h"
+
+static volatile int a;
+static volatile sig_atomic_t done;
+
+static void sighandler(int sig __maybe_unused)
+{
+	done = 1;
+}
+
+static inline void __attribute__((always_inline)) leaf(int b)
+{
+again:
+	a += b;
+	if (!done)
+		goto again;
+}
+
+static inline void __attribute__((always_inline)) middle(int b)
+{
+	leaf(b);
+}
+
+static noinline void parent(int b)
+{
+	middle(b);
+}
+
+static int inlineloop(int argc, const char **argv)
+{
+	int sec = 1;
+
+	pthread_setname_np(pthread_self(), "perf-inlineloop");
+	if (argc > 0)
+		sec = atoi(argv[0]);
+
+	signal(SIGINT, sighandler);
+	signal(SIGALRM, sighandler);
+	alarm(sec);
+
+	parent(sec);
+
+	return 0;
+}
+
+DEFINE_WORKLOAD(inlineloop);
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 36aca8d6d003..ea17e6d29a7e 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -30,7 +30,7 @@ struct annotate_browser {
 	struct rb_root		    entries;
 	struct rb_node		   *curr_hot;
 	struct annotation_line	   *selection;
-	struct arch		   *arch;
+	const struct arch	   *arch;
 	/*
 	 * perf top can delete hist_entry anytime.  Callers should make sure
 	 * its lifetime.
@@ -601,7 +601,7 @@ static bool annotate_browser__callq(struct annotate_browser *browser,
 		return true;
 	}
 
-	target_ms.maps = ms->maps;
+	target_ms.thread = ms->thread;
 	target_ms.map = ms->map;
 	target_ms.sym = dl->ops.target.sym;
 	annotation__unlock(notes);
@@ -1198,7 +1198,7 @@ int __hist_entry__tui_annotate(struct hist_entry *he, struct map_symbol *ms,
 				ui__warning("Annotation has no source code.");
 		}
 	} else {
-		err = evsel__get_arch(evsel, &browser.arch);
+		err = thread__get_arch(ms->thread, &browser.arch);
 		if (err) {
 			annotate_browser__symbol_annotate_error(&browser, err);
 			return -1;
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 08fecbe28a52..cfa6386e6e1d 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -3189,7 +3189,8 @@ do_hotkey:		 // key came straight from options ui__popup_menu()
 		case 'k':
 			if (browser->selection != NULL)
 				hists_browser__zoom_map(browser,
-					      maps__machine(browser->selection->maps)->vmlinux_map);
+					maps__machine(thread__maps(browser->selection->thread)
+						     )->vmlinux_map);
 			continue;
 		case 'V':
 			verbose = (verbose + 1) % 4;
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 1c2a43e1dc68..bcccad7487a9 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -1,15 +1,17 @@
 include $(srctree)/tools/scripts/Makefile.include
 include $(srctree)/tools/scripts/utilities.mak
 
+perf-util-y += annotate-arch/
 perf-util-y += arm64-frame-pointer-unwind-support.o
 perf-util-y += addr2line.o
 perf-util-y += addr_location.o
 perf-util-y += annotate.o
+perf-util-y += blake2s.o
 perf-util-y += block-info.o
 perf-util-y += block-range.o
 perf-util-y += build-id.o
 perf-util-y += cacheline.o
-perf-util-y += capstone.o
+perf-util-$(CONFIG_LIBCAPSTONE) += capstone.o
 perf-util-y += config.o
 perf-util-y += copyfile.o
 perf-util-y += ctype.o
@@ -43,7 +45,6 @@ perf-util-y += rbtree.o
 perf-util-y += libstring.o
 perf-util-y += bitmap.o
 perf-util-y += hweight.o
-perf-util-y += sha1.o
 perf-util-y += smt.o
 perf-util-y += strbuf.o
 perf-util-y += string.o
@@ -127,7 +128,8 @@ perf-util-y += spark.o
 perf-util-y += topdown.o
 perf-util-y += iostat.o
 perf-util-y += stream.o
-perf-util-y += kvm-stat.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
+perf-util-y += kvm-stat-arch/
 perf-util-y += lock-contention.o
 perf-util-y += auxtrace.o
 perf-util-y += intel-pt-decoder/
@@ -219,13 +221,12 @@ endif
 perf-util-$(CONFIG_LIBDW) += probe-finder.o
 perf-util-$(CONFIG_LIBDW) += dwarf-aux.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs.o
-perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o
-perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o
-perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-arch/
 perf-util-$(CONFIG_LIBDW) += debuginfo.o
 perf-util-$(CONFIG_LIBDW) += annotate-data.o
+perf-util-$(CONFIG_LIBDW) += libdw.o
+perf-util-$(CONFIG_LIBDW) += unwind-libdw.o
 
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 perf-util-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind-local.o
 perf-util-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
 perf-util-$(CONFIG_LIBUNWIND_X86)      += libunwind/x86_32.o
@@ -418,20 +419,6 @@ $(OUTPUT)util/list_sort.o: ../lib/list_sort.c FORCE
 	$(call rule_mkdir)
 	$(call if_changed_dep,cc_o_c)
 
-ifdef SHELLCHECK
-  SHELL_TESTS := generate-cmdlist.sh
-  SHELL_TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log)
-else
-  SHELL_TESTS :=
-  SHELL_TEST_LOGS :=
-endif
-
-$(OUTPUT)%.shellcheck_log: %
-	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,test)$(SHELLCHECK) "$<" > $@ || (cat $@ && rm $@ && false)
-
-perf-util-y += $(SHELL_TEST_LOGS)
-
 PY_TESTS := setup.py
 ifdef MYPY
   MYPY_TEST_LOGS := $(PY_TESTS:%=%.mypy_log)
diff --git a/tools/perf/util/addr2line.c b/tools/perf/util/addr2line.c
index f2d94a3272d7..31c0391fffa3 100644
--- a/tools/perf/util/addr2line.c
+++ b/tools/perf/util/addr2line.c
@@ -18,8 +18,8 @@
 
 #define MAX_INLINE_NEST 1024
 
-/* If addr2line doesn't return data for 1 second then timeout. */
-int addr2line_timeout_ms = 1 * 1000;
+/* If addr2line doesn't return data for 5 seconds then timeout. */
+int addr2line_timeout_ms = 5 * 1000;
 
 static int filename_split(char *filename, unsigned int *line_nr)
 {
@@ -90,16 +90,16 @@ static struct child_process *addr2line_subprocess_init(const char *addr2line_pat
 	return a2l;
 }
 
-enum a2l_style {
+enum cmd_a2l_style {
 	BROKEN,
 	GNU_BINUTILS,
 	LLVM,
 };
 
-static enum a2l_style addr2line_configure(struct child_process *a2l, const char *dso_name)
+static enum cmd_a2l_style cmd_addr2line_configure(struct child_process *a2l, const char *dso_name)
 {
 	static bool cached;
-	static enum a2l_style style;
+	static enum cmd_a2l_style style;
 
 	if (!cached) {
 		char buf[128];
@@ -149,7 +149,7 @@ static enum a2l_style addr2line_configure(struct child_process *a2l, const char
 }
 
 static int read_addr2line_record(struct io *io,
-				 enum a2l_style style,
+				 enum cmd_a2l_style style,
 				 const char *dso_name,
 				 u64 addr,
 				 bool first,
@@ -298,7 +298,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 	char buf[128];
 	ssize_t written;
 	struct io io = { .eof = false };
-	enum a2l_style a2l_style;
+	enum cmd_a2l_style cmd_a2l_style;
 
 	if (!a2l) {
 		if (!filename__has_section(dso_name, ".debug_line"))
@@ -314,8 +314,8 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 			pr_warning("%s %s: addr2line_subprocess_init failed\n", __func__, dso_name);
 		goto out;
 	}
-	a2l_style = addr2line_configure(a2l, dso_name);
-	if (a2l_style == BROKEN)
+	cmd_a2l_style = cmd_addr2line_configure(a2l, dso_name);
+	if (cmd_a2l_style == BROKEN)
 		goto out;
 
 	/*
@@ -336,7 +336,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 	}
 	io__init(&io, a2l->out, buf, sizeof(buf));
 	io.timeout_ms = addr2line_timeout_ms;
-	switch (read_addr2line_record(&io, a2l_style, dso_name, addr, /*first=*/true,
+	switch (read_addr2line_record(&io, cmd_a2l_style, dso_name, addr, /*first=*/true,
 				      &record_function, &record_filename, &record_line_nr)) {
 	case -1:
 		if (!symbol_conf.disable_add2line_warn)
@@ -351,7 +351,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 		 * binutils, also force a non-zero address as we're no longer
 		 * reading that record.
 		 */
-		switch (read_addr2line_record(&io, a2l_style, dso_name,
+		switch (read_addr2line_record(&io, cmd_a2l_style, dso_name,
 					      /*addr=*/1, /*first=*/true,
 					      NULL, NULL, NULL)) {
 		case -1:
@@ -397,7 +397,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 	 * as we're reading records beyond the first.
 	 */
 	while ((record_status = read_addr2line_record(&io,
-						      a2l_style,
+						      cmd_a2l_style,
 						      dso_name,
 						      /*addr=*/1,
 						      /*first=*/false,
diff --git a/tools/perf/util/addr_location.c b/tools/perf/util/addr_location.c
index 007a2f5df9a6..57e8217a00f9 100644
--- a/tools/perf/util/addr_location.c
+++ b/tools/perf/util/addr_location.c
@@ -7,7 +7,6 @@
 void addr_location__init(struct addr_location *al)
 {
 	al->thread = NULL;
-	al->maps = NULL;
 	al->map = NULL;
 	al->sym = NULL;
 	al->srcline = NULL;
@@ -24,22 +23,19 @@ void addr_location__init(struct addr_location *al)
  * The preprocess_sample method will return with reference counts for the
  * in it, when done using (and perhaps getting ref counts if needing to
  * keep a pointer to one of those entries) it must be paired with
- * addr_location__put(), so that the refcounts can be decremented.
+ * addr_location__exit(), so that the refcounts can be decremented.
  */
 void addr_location__exit(struct addr_location *al)
 {
 	map__zput(al->map);
 	thread__zput(al->thread);
-	maps__zput(al->maps);
 }
 
 void addr_location__copy(struct addr_location *dst, struct addr_location *src)
 {
 	thread__put(dst->thread);
-	maps__put(dst->maps);
 	map__put(dst->map);
 	*dst = *src;
 	dst->thread = thread__get(src->thread);
-	dst->maps = maps__get(src->maps);
 	dst->map = map__get(src->map);
 }
diff --git a/tools/perf/util/addr_location.h b/tools/perf/util/addr_location.h
index 64b551025216..fdc4d3f3a68b 100644
--- a/tools/perf/util/addr_location.h
+++ b/tools/perf/util/addr_location.h
@@ -11,7 +11,6 @@ struct symbol;
 
 struct addr_location {
 	struct thread *thread;
-	struct maps   *maps;
 	struct map    *map;
 	struct symbol *sym;
 	const char    *srcline;
diff --git a/tools/perf/util/annotate-arch/Build b/tools/perf/util/annotate-arch/Build
new file mode 100644
index 000000000000..23316743fdc5
--- /dev/null
+++ b/tools/perf/util/annotate-arch/Build
@@ -0,0 +1,11 @@
+perf-util-y += annotate-arc.o
+perf-util-y += annotate-arm.o
+perf-util-y += annotate-arm64.o
+perf-util-y += annotate-csky.o
+perf-util-y += annotate-loongarch.o
+perf-util-y += annotate-mips.o
+perf-util-y += annotate-x86.o
+perf-util-y += annotate-powerpc.o
+perf-util-y += annotate-riscv64.o
+perf-util-y += annotate-s390.o
+perf-util-y += annotate-sparc.o
diff --git a/tools/perf/util/annotate-arch/annotate-arc.c b/tools/perf/util/annotate-arch/annotate-arc.c
new file mode 100644
index 000000000000..170103e383a4
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-arc.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compiler.h>
+#include <linux/zalloc.h>
+#include "../disasm.h"
+
+const struct arch *arch__new_arc(const struct e_machine_and_e_flags *id,
+				 const char *cpuid __maybe_unused)
+{
+	struct arch *arch = zalloc(sizeof(*arch));
+
+	if (!arch)
+		return NULL;
+
+	arch->name = "arc";
+	arch->id = *id;
+	arch->objdump.comment_char = ';';
+	return arch;
+}
diff --git a/tools/perf/arch/arm/annotate/instructions.c b/tools/perf/util/annotate-arch/annotate-arm.c
index 5e667b0f5512..afb413c80156 100644
--- a/tools/perf/arch/arm/annotate/instructions.c
+++ b/tools/perf/util/annotate-arch/annotate-arm.c
@@ -1,20 +1,22 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <stdlib.h>
 #include <linux/compiler.h>
 #include <linux/zalloc.h>
 #include <errno.h>
-#include <sys/types.h>
 #include <regex.h>
-#include <stdlib.h>
+#include "../annotate.h"
+#include "../disasm.h"
 
-struct arm_annotate {
-	regex_t call_insn,
-		jump_insn;
+struct arch_arm {
+	struct arch arch;
+	regex_t call_insn;
+	regex_t jump_insn;
 };
 
-static struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name)
+static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name)
 {
-	struct arm_annotate *arm = arch->priv;
-	struct ins_ops *ops;
+	struct arch_arm *arm = container_of(arch, struct arch_arm, arch);
+	const struct ins_ops *ops;
 	regmatch_t match[2];
 
 	if (!regexec(&arm->call_insn, name, 2, match, 0))
@@ -28,39 +30,39 @@ static struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const c
 	return ops;
 }
 
-static int arm__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_arm(const struct e_machine_and_e_flags *id,
+				 const char *cpuid __maybe_unused)
 {
-	struct arm_annotate *arm;
 	int err;
+	struct arch_arm *arm = zalloc(sizeof(*arm));
+	struct arch *arch;
 
-	if (arch->initialized)
-		return 0;
-
-	arm = zalloc(sizeof(*arm));
 	if (!arm)
-		return ENOMEM;
+		return NULL;
+
+	arch = &arm->arch;
+	arch->name = "arm";
+	arch->id = *id;
+	arch->objdump.comment_char	  = ';';
+	arch->objdump.skip_functions_char = '+';
+	arch->associate_instruction_ops   = arm__associate_instruction_ops;
 
 #define ARM_CONDS "(cc|cs|eq|ge|gt|hi|le|ls|lt|mi|ne|pl|vc|vs)"
 	err = regcomp(&arm->call_insn, "^blx?" ARM_CONDS "?$", REG_EXTENDED);
 	if (err)
 		goto out_free_arm;
+
 	err = regcomp(&arm->jump_insn, "^bx?" ARM_CONDS "?$", REG_EXTENDED);
 	if (err)
 		goto out_free_call;
 #undef ARM_CONDS
 
-	arch->initialized = true;
-	arch->priv	  = arm;
-	arch->associate_instruction_ops   = arm__associate_instruction_ops;
-	arch->objdump.comment_char	  = ';';
-	arch->objdump.skip_functions_char = '+';
-	arch->e_machine = EM_ARM;
-	arch->e_flags = 0;
-	return 0;
+	return arch;
 
 out_free_call:
 	regfree(&arm->call_insn);
 out_free_arm:
 	free(arm);
-	return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
+	errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
+	return NULL;
 }
diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/util/annotate-arch/annotate-arm64.c
index 16cb62d40bd9..33080fdca125 100644
--- a/tools/perf/arch/arm64/annotate/instructions.c
+++ b/tools/perf/util/annotate-arch/annotate-arm64.c
@@ -1,16 +1,20 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/compiler.h>
 #include <errno.h>
-#include <sys/types.h>
-#include <regex.h>
 #include <stdlib.h>
+#include <string.h>
+#include <linux/zalloc.h>
+#include <regex.h>
+#include "../annotate.h"
+#include "../disasm.h"
 
-struct arm64_annotate {
-	regex_t call_insn,
-		jump_insn;
+struct arch_arm64 {
+	struct arch arch;
+	regex_t call_insn;
+	regex_t jump_insn;
 };
 
-static int arm64_mov__parse(struct arch *arch __maybe_unused,
+static int arm64_mov__parse(const struct arch *arch __maybe_unused,
 			    struct ins_operands *ops,
 			    struct map_symbol *ms __maybe_unused,
 			    struct disasm_line *dl __maybe_unused)
@@ -60,18 +64,15 @@ out_free_source:
 	return -1;
 }
 
-static int mov__scnprintf(struct ins *ins, char *bf, size_t size,
-			  struct ins_operands *ops, int max_ins_name);
-
-static struct ins_ops arm64_mov_ops = {
+static const struct ins_ops arm64_mov_ops = {
 	.parse	   = arm64_mov__parse,
 	.scnprintf = mov__scnprintf,
 };
 
-static struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name)
+static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name)
 {
-	struct arm64_annotate *arm = arch->priv;
-	struct ins_ops *ops;
+	struct arch_arm64 *arm = container_of(arch, struct arch_arm64, arch);
+	const struct ins_ops *ops;
 	regmatch_t match[2];
 
 	if (!regexec(&arm->jump_insn, name, 2, match, 0))
@@ -87,40 +88,40 @@ static struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const
 	return ops;
 }
 
-static int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_arm64(const struct e_machine_and_e_flags *id,
+				   const char *cpuid __maybe_unused)
 {
-	struct arm64_annotate *arm;
 	int err;
+	struct arch_arm64 *arm = zalloc(sizeof(*arm));
+	struct arch *arch;
 
-	if (arch->initialized)
-		return 0;
-
-	arm = zalloc(sizeof(*arm));
 	if (!arm)
-		return ENOMEM;
+		return NULL;
+
+	arch = &arm->arch;
+	arch->name = "arm64";
+	arch->id = *id;
+	arch->objdump.comment_char	  = '/';
+	arch->objdump.skip_functions_char = '+';
+	arch->associate_instruction_ops   = arm64__associate_instruction_ops;
 
 	/* bl, blr */
 	err = regcomp(&arm->call_insn, "^blr?$", REG_EXTENDED);
 	if (err)
 		goto out_free_arm;
+
 	/* b, b.cond, br, cbz/cbnz, tbz/tbnz */
 	err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|hs|le|lo|ls|lt|mi|ne|pl|vc|vs)?n?z?$",
 		      REG_EXTENDED);
 	if (err)
 		goto out_free_call;
 
-	arch->initialized = true;
-	arch->priv	  = arm;
-	arch->associate_instruction_ops   = arm64__associate_instruction_ops;
-	arch->objdump.comment_char	  = '/';
-	arch->objdump.skip_functions_char = '+';
-	arch->e_machine = EM_AARCH64;
-	arch->e_flags = 0;
-	return 0;
+	return arch;
 
 out_free_call:
 	regfree(&arm->call_insn);
 out_free_arm:
 	free(arm);
-	return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
+	errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
+	return NULL;
 }
diff --git a/tools/perf/arch/csky/annotate/instructions.c b/tools/perf/util/annotate-arch/annotate-csky.c
index 14270311d215..d2b18e4ea2c9 100644
--- a/tools/perf/arch/csky/annotate/instructions.c
+++ b/tools/perf/util/annotate-arch/annotate-csky.c
@@ -1,12 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
-
+#include <string.h>
 #include <linux/compiler.h>
+#include <linux/zalloc.h>
+#include "../disasm.h"
 
-static struct ins_ops *csky__associate_ins_ops(struct arch *arch,
-					       const char *name)
+static const struct ins_ops *csky__associate_ins_ops(struct arch *arch,
+						     const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	/* catch all kind of jumps */
 	if (!strcmp(name, "bt") ||
@@ -38,16 +40,17 @@ static struct ins_ops *csky__associate_ins_ops(struct arch *arch,
 	return ops;
 }
 
-static int csky__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_csky(const struct e_machine_and_e_flags *id,
+				  const char *cpuid __maybe_unused)
 {
-	arch->initialized = true;
+	struct arch *arch = zalloc(sizeof(*arch));
+
+	if (!arch)
+		return NULL;
+
+	arch->name = "csky";
+	arch->id = *id;
 	arch->objdump.comment_char = '/';
 	arch->associate_instruction_ops = csky__associate_ins_ops;
-	arch->e_machine = EM_CSKY;
-#if defined(__CSKYABIV2__)
-	arch->e_flags = EF_CSKY_ABIV2;
-#else
-	arch->e_flags = EF_CSKY_ABIV1;
-#endif
-	return 0;
+	return arch;
 }
diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/util/annotate-arch/annotate-loongarch.c
index 70262d5f1444..3aeab453a059 100644
--- a/tools/perf/arch/loongarch/annotate/instructions.c
+++ b/tools/perf/util/annotate-arch/annotate-loongarch.c
@@ -4,15 +4,23 @@
  *
  * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
  */
-
-static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
-		struct disasm_line *dl __maybe_unused)
+#include <stdlib.h>
+#include <string.h>
+#include <linux/compiler.h>
+#include <linux/zalloc.h>
+#include "../disasm.h"
+#include "../map.h"
+#include "../maps.h"
+#include "../symbol.h"
+#include "../thread.h"
+
+static int loongarch_call__parse(const struct arch *arch, struct ins_operands *ops,
+				 struct map_symbol *ms,
+				 struct disasm_line *dl __maybe_unused)
 {
 	char *c, *endptr, *tok, *name;
 	struct map *map = ms->map;
-	struct addr_map_symbol target = {
-		.ms = { .map = map, },
-	};
+	struct addr_map_symbol target;
 
 	c = strchr(ops->raw, '#');
 	if (c++ == NULL)
@@ -38,27 +46,34 @@ static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, st
 	if (ops->target.name == NULL)
 		return -1;
 
-	target.addr = map__objdump_2mem(map, ops->target.addr);
+	target = (struct addr_map_symbol) {
+		.ms = { .map = map__get(map), },
+		.addr = map__objdump_2mem(map, ops->target.addr),
+	};
 
-	if (maps__find_ams(ms->maps, &target) == 0 &&
+	if (maps__find_ams(thread__maps(ms->thread), &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
+	addr_map_symbol__exit(&target);
 	return 0;
 }
 
-static struct ins_ops loongarch_call_ops = {
+static const struct ins_ops loongarch_call_ops = {
 	.parse	   = loongarch_call__parse,
 	.scnprintf = call__scnprintf,
+	.is_call   = true,
 };
 
-static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
-		struct disasm_line *dl __maybe_unused)
+static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *ops,
+				 struct map_symbol *ms,
+				 struct disasm_line *dl __maybe_unused)
+
 {
 	struct map *map = ms->map;
 	struct symbol *sym = ms->sym;
 	struct addr_map_symbol target = {
-		.ms = { .map = map, },
+		.ms = { .map = map__get(map), },
 	};
 	const char *c = strchr(ops->raw, '#');
 	u64 start, end;
@@ -80,7 +95,7 @@ static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, st
 
 	ops->target.outside = target.addr < start || target.addr > end;
 
-	if (maps__find_ams(ms->maps, &target) == 0 &&
+	if (maps__find_ams(thread__maps(ms->thread), &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
@@ -90,19 +105,20 @@ static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, st
 	} else {
 		ops->target.offset_avail = false;
 	}
-
+	addr_map_symbol__exit(&target);
 	return 0;
 }
 
-static struct ins_ops loongarch_jump_ops = {
+static const struct ins_ops loongarch_jump_ops = {
 	.parse	   = loongarch_jump__parse,
 	.scnprintf = jump__scnprintf,
+	.is_jump   = true,
 };
 
 static
-struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name)
+const struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	if (!strcmp(name, "bl"))
 		ops = &loongarch_call_ops;
@@ -124,16 +140,17 @@ struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name
 	return ops;
 }
 
-static
-int loongarch__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_loongarch(const struct e_machine_and_e_flags *id,
+				       const char *cpuid __maybe_unused)
 {
-	if (!arch->initialized) {
-		arch->associate_instruction_ops = loongarch__associate_ins_ops;
-		arch->initialized = true;
-		arch->objdump.comment_char = '#';
-		arch->e_machine = EM_LOONGARCH;
-		arch->e_flags = 0;
-	}
+	struct arch *arch = zalloc(sizeof(*arch));
 
-	return 0;
+	if (!arch)
+		return NULL;
+
+	arch->name = "loongarch";
+	arch->id = *id;
+	arch->associate_instruction_ops = loongarch__associate_ins_ops;
+	arch->objdump.comment_char = '#';
+	return arch;
 }
diff --git a/tools/perf/arch/mips/annotate/instructions.c b/tools/perf/util/annotate-arch/annotate-mips.c
index b50b46c613d6..e8d1c6c7e9f3 100644
--- a/tools/perf/arch/mips/annotate/instructions.c
+++ b/tools/perf/util/annotate-arch/annotate-mips.c
@@ -1,9 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/compiler.h>
+#include <linux/zalloc.h>
+#include "../disasm.h"
 
 static
-struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name)
+const struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	if (!strncmp(name, "bal", 3) ||
 	    !strncmp(name, "bgezal", 6) ||
@@ -33,16 +37,17 @@ struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name)
 	return ops;
 }
 
-static
-int mips__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_mips(const struct e_machine_and_e_flags *id,
+				  const char *cpuid __maybe_unused)
 {
-	if (!arch->initialized) {
-		arch->associate_instruction_ops = mips__associate_ins_ops;
-		arch->initialized = true;
-		arch->objdump.comment_char = '#';
-		arch->e_machine = EM_MIPS;
-		arch->e_flags = 0;
-	}
+	struct arch *arch = zalloc(sizeof(*arch));
+
+	if (!arch)
+		return NULL;
 
-	return 0;
+	arch->name = "mips";
+	arch->id = *id;
+	arch->objdump.comment_char = '#';
+	arch->associate_instruction_ops = mips__associate_ins_ops;
+	return arch;
 }
diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/util/annotate-arch/annotate-powerpc.c
index ca567cfdcbdb..218207b52581 100644
--- a/tools/perf/arch/powerpc/annotate/instructions.c
+++ b/tools/perf/util/annotate-arch/annotate-powerpc.c
@@ -1,10 +1,102 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <string.h>
 #include <linux/compiler.h>
+#include <linux/kernel.h>
+#include "../annotate-data.h"
+#include "../debug.h"
+#include "../disasm.h"
 
-static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name)
+#define PPC_OP(op)	(((op) >> 26) & 0x3F)
+#define PPC_21_30(R)	(((R) >> 1) & 0x3ff)
+#define PPC_22_30(R)	(((R) >> 1) & 0x1ff)
+
+#define MINUS_EXT_XO_FORM	234
+#define SUB_EXT_XO_FORM		232
+#define	ADD_ZERO_EXT_XO_FORM	202
+#define	SUB_ZERO_EXT_XO_FORM	200
+
+static int arithmetic__scnprintf(const struct ins *ins, char *bf, size_t size,
+		struct ins_operands *ops, int max_ins_name)
+{
+	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
+			ops->raw);
+}
+
+/*
+ * Sets the fields: multi_regs and "mem_ref".
+ * "mem_ref" is set for ops->source which is later used to
+ * fill the objdump->memory_ref-char field. This ops is currently
+ * used by powerpc and since binary instruction code is used to
+ * extract opcode, regs and offset, no other parsing is needed here.
+ *
+ * Dont set multi regs for 4 cases since it has only one operand
+ * for source:
+ * - Add to Minus One Extended XO-form ( Ex: addme, addmeo )
+ * - Subtract From Minus One Extended XO-form ( Ex: subfme )
+ * - Add to Zero Extended XO-form ( Ex: addze, addzeo )
+ * - Subtract From Zero Extended XO-form ( Ex: subfze )
+ */
+static int arithmetic__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
+		struct map_symbol *ms __maybe_unused, struct disasm_line *dl)
+{
+	int opcode = PPC_OP(dl->raw.raw_insn);
+
+	ops->source.mem_ref = false;
+	if (opcode == 31) {
+		if ((opcode != MINUS_EXT_XO_FORM) && (opcode != SUB_EXT_XO_FORM) &&
+		    (opcode != ADD_ZERO_EXT_XO_FORM) && (opcode != SUB_ZERO_EXT_XO_FORM))
+			ops->source.multi_regs = true;
+	}
+
+	ops->target.mem_ref = false;
+	ops->target.multi_regs = false;
+
+	return 0;
+}
+
+static const struct ins_ops arithmetic_ops = {
+	.parse     = arithmetic__parse,
+	.scnprintf = arithmetic__scnprintf,
+};
+
+static int load_store__scnprintf(const struct ins *ins, char *bf, size_t size,
+		struct ins_operands *ops, int max_ins_name)
+{
+	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
+			ops->raw);
+}
+
+/*
+ * Sets the fields: multi_regs and "mem_ref".
+ * "mem_ref" is set for ops->source which is later used to
+ * fill the objdump->memory_ref-char field. This ops is currently
+ * used by powerpc and since binary instruction code is used to
+ * extract opcode, regs and offset, no other parsing is needed here
+ */
+static int load_store__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
+		struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused)
+{
+	ops->source.mem_ref = true;
+	ops->source.multi_regs = false;
+	/* opcode 31 is of X form */
+	if (PPC_OP(dl->raw.raw_insn) == 31)
+		ops->source.multi_regs = true;
+
+	ops->target.mem_ref = false;
+	ops->target.multi_regs = false;
+
+	return 0;
+}
+
+static const struct ins_ops load_store_ops = {
+	.parse     = load_store__parse,
+	.scnprintf = load_store__scnprintf,
+};
+
+static const struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name)
 {
 	int i;
-	struct ins_ops *ops;
+	const struct ins_ops *ops;
 
 	/*
 	 * - Interested only if instruction starts with 'b'.
@@ -49,10 +141,6 @@ static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, con
 	return ops;
 }
 
-#define PPC_OP(op)	(((op) >> 26) & 0x3F)
-#define PPC_21_30(R)	(((R) >> 1) & 0x3ff)
-#define PPC_22_30(R)	(((R) >> 1) & 0x1ff)
-
 struct insn_offset {
 	const char	*name;
 	int		value;
@@ -189,7 +277,7 @@ static int cmp_offset(const void *a, const void *b)
 	return (val1->value - val2->value);
 }
 
-static struct ins_ops *check_ppc_insn(struct disasm_line *dl)
+const struct ins_ops *check_ppc_insn(struct disasm_line *dl)
 {
 	int raw_insn = dl->raw.raw_insn;
 	int opcode = PPC_OP(raw_insn);
@@ -302,16 +390,21 @@ static void update_insn_state_powerpc(struct type_state *state,
 }
 #endif /* HAVE_LIBDW_SUPPORT */
 
-static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_powerpc(const struct e_machine_and_e_flags *id,
+				     const char *cpuid __maybe_unused)
 {
-	if (!arch->initialized) {
-		arch->initialized = true;
-		arch->associate_instruction_ops = powerpc__associate_instruction_ops;
-		arch->objdump.comment_char      = '#';
-		annotate_opts.show_asm_raw = true;
-		arch->e_machine = EM_PPC;
-		arch->e_flags = 0;
-	}
+	struct arch *arch = zalloc(sizeof(*arch));
 
-	return 0;
+	if (!arch)
+		return NULL;
+
+	arch->name = "powerpc";
+	arch->id = *id;
+	arch->objdump.comment_char = '#';
+	annotate_opts.show_asm_raw = true;
+	arch->associate_instruction_ops = powerpc__associate_instruction_ops;
+#ifdef HAVE_LIBDW_SUPPORT
+	arch->update_insn_state = update_insn_state_powerpc;
+#endif
+	return arch;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-riscv64.c b/tools/perf/util/annotate-arch/annotate-riscv64.c
new file mode 100644
index 000000000000..29a988fca8c9
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-riscv64.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/compiler.h>
+#include <linux/zalloc.h>
+#include "../disasm.h"
+
+static
+const struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *name)
+{
+	const struct ins_ops *ops = NULL;
+
+	if (!strncmp(name, "jal", 3) ||
+	    !strncmp(name, "jr", 2) ||
+	    !strncmp(name, "call", 4))
+		ops = &call_ops;
+	else if (!strncmp(name, "ret", 3))
+		ops = &ret_ops;
+	else if (name[0] == 'j' || name[0] == 'b')
+		ops = &jump_ops;
+	else
+		return NULL;
+
+	arch__associate_ins_ops(arch, name, ops);
+
+	return ops;
+}
+
+const struct arch *arch__new_riscv64(const struct e_machine_and_e_flags *id,
+				     const char *cpuid __maybe_unused)
+{
+	struct arch *arch = zalloc(sizeof(*arch));
+
+	if (!arch)
+		return NULL;
+
+	arch->name = "riscv";
+	arch->id = *id;
+	arch->objdump.comment_char = '#';
+	arch->associate_instruction_ops = riscv64__associate_ins_ops;
+	return arch;
+}
diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/util/annotate-arch/annotate-s390.c
index c61193f1e096..af9cabd0a586 100644
--- a/tools/perf/arch/s390/annotate/instructions.c
+++ b/tools/perf/util/annotate-arch/annotate-s390.c
@@ -1,14 +1,22 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <string.h>
 #include <linux/compiler.h>
-
-static int s390_call__parse(struct arch *arch, struct ins_operands *ops,
-			    struct map_symbol *ms, struct disasm_line *dl __maybe_unused)
+#include "../debug.h"
+#include "../disasm.h"
+#include "../map.h"
+#include "../maps.h"
+#include "../symbol.h"
+#include "../thread.h"
+#include "../annotate.h"
+#include "../annotate-data.h"
+
+static int s390_call__parse(const struct arch *arch, struct ins_operands *ops,
+			    struct map_symbol *ms,
+			    struct disasm_line *dl __maybe_unused)
 {
 	char *endptr, *tok, *name;
 	struct map *map = ms->map;
-	struct addr_map_symbol target = {
-		.ms = { .map = map, },
-	};
+	struct addr_map_symbol target;
 
 	tok = strchr(ops->raw, ',');
 	if (!tok)
@@ -36,21 +44,27 @@ static int s390_call__parse(struct arch *arch, struct ins_operands *ops,
 
 	if (ops->target.name == NULL)
 		return -1;
-	target.addr = map__objdump_2mem(map, ops->target.addr);
 
-	if (maps__find_ams(ms->maps, &target) == 0 &&
+	target = (struct addr_map_symbol) {
+		.ms = { .map = map__get(map), },
+		.addr = map__objdump_2mem(map, ops->target.addr),
+	};
+
+	if (maps__find_ams(thread__maps(ms->thread), &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
+	addr_map_symbol__exit(&target);
 	return 0;
 }
 
-static struct ins_ops s390_call_ops = {
+static const struct ins_ops s390_call_ops = {
 	.parse	   = s390_call__parse,
 	.scnprintf = call__scnprintf,
+	.is_call   = true,
 };
 
-static int s390_mov__parse(struct arch *arch __maybe_unused,
+static int s390_mov__parse(const struct arch *arch __maybe_unused,
 			   struct ins_operands *ops,
 			   struct map_symbol *ms __maybe_unused,
 			   struct disasm_line *dl __maybe_unused)
@@ -99,14 +113,14 @@ out_free_source:
 }
 
 
-static struct ins_ops s390_mov_ops = {
+static const struct ins_ops s390_mov_ops = {
 	.parse	   = s390_mov__parse,
 	.scnprintf = mov__scnprintf,
 };
 
-static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name)
+static const struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	/* catch all kind of jumps */
 	if (strchr(name, 'j') ||
@@ -134,7 +148,7 @@ static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *na
 	return ops;
 }
 
-static int s390__cpuid_parse(struct arch *arch, char *cpuid)
+static int s390__cpuid_parse(struct arch *arch, const char *cpuid)
 {
 	unsigned int family;
 	char model[16], model_c[16], cpumf_v[16], cpumf_a[16];
@@ -155,20 +169,22 @@ static int s390__cpuid_parse(struct arch *arch, char *cpuid)
 	return -1;
 }
 
-static int s390__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_s390(const struct e_machine_and_e_flags *id, const char *cpuid)
 {
-	int err = 0;
-
-	if (!arch->initialized) {
-		arch->initialized = true;
-		arch->associate_instruction_ops = s390__associate_ins_ops;
-		if (cpuid) {
-			if (s390__cpuid_parse(arch, cpuid))
-				err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
+	struct arch *arch = zalloc(sizeof(*arch));
+
+	if (!arch)
+		return NULL;
+
+	arch->name = "s390";
+	arch->id = *id;
+	arch->associate_instruction_ops = s390__associate_ins_ops;
+	if (cpuid) {
+		if (s390__cpuid_parse(arch, cpuid)) {
+			errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
+			return NULL;
 		}
-		arch->e_machine = EM_S390;
-		arch->e_flags = 0;
 	}
-
-	return err;
+	arch->objdump.comment_char = '#';
+	return arch;
 }
diff --git a/tools/perf/arch/sparc/annotate/instructions.c b/tools/perf/util/annotate-arch/annotate-sparc.c
index 68c31580ccfc..2f07bc7a56dd 100644
--- a/tools/perf/arch/sparc/annotate/instructions.c
+++ b/tools/perf/util/annotate-arch/annotate-sparc.c
@@ -1,4 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/compiler.h>
+#include <linux/zalloc.h>
+#include "../../util/disasm.h"
 
 static int is_branch_cond(const char *cond)
 {
@@ -117,9 +121,9 @@ static int is_branch_float_cond(const char *cond)
 	return 0;
 }
 
-static struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name)
+static const struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	if (!strcmp(name, "call") ||
 	    !strcmp(name, "jmp") ||
@@ -157,15 +161,17 @@ static struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const
 	return ops;
 }
 
-static int sparc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_sparc(const struct e_machine_and_e_flags *id,
+				   const char *cpuid __maybe_unused)
 {
-	if (!arch->initialized) {
-		arch->initialized = true;
-		arch->associate_instruction_ops = sparc__associate_instruction_ops;
-		arch->objdump.comment_char = '#';
-		arch->e_machine = EM_SPARC;
-		arch->e_flags = 0;
-	}
+	struct arch *arch = zalloc(sizeof(*arch));
 
-	return 0;
+	if (!arch)
+		return NULL;
+
+	arch->name = "sparc";
+	arch->id = *id;
+	arch->associate_instruction_ops = sparc__associate_instruction_ops;
+	arch->objdump.comment_char = '#';
+	return arch;
 }
diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/util/annotate-arch/annotate-x86.c
index 803f9351a3fb..eb9a649ca656 100644
--- a/tools/perf/arch/x86/annotate/instructions.c
+++ b/tools/perf/util/annotate-arch/annotate-x86.c
@@ -1,4 +1,16 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/compiler.h>
+#include <assert.h>
+#include <inttypes.h>
+#include "../annotate-data.h"
+#include "../debug.h"
+#include "../disasm.h"
+#include "../dso.h"
+#include "../map.h"
+#include "../string2.h" // strstarts
+#include "../symbol.h"
+
 /*
  * x86 instruction nmemonic table to parse disasm lines for annotate.
  * This table is searched twice - one for exact match and another for
@@ -7,7 +19,7 @@
  * So this table should not have entries with the suffix unless it's
  * a complete different instruction than ones without the suffix.
  */
-static struct ins x86__instructions[] = {
+static const struct ins x86__instructions[] = {
 	{ .name = "adc",	.ops = &mov_ops,  },
 	{ .name = "add",	.ops = &mov_ops,  },
 	{ .name = "addsd",	.ops = &mov_ops,  },
@@ -19,9 +31,9 @@ static struct ins x86__instructions[] = {
 	{ .name = "btr",	.ops = &mov_ops,  },
 	{ .name = "bts",	.ops = &mov_ops,  },
 	{ .name = "call",	.ops = &call_ops, },
+	{ .name = "cmovae",	.ops = &mov_ops,  },
 	{ .name = "cmovbe",	.ops = &mov_ops,  },
 	{ .name = "cmove",	.ops = &mov_ops,  },
-	{ .name = "cmovae",	.ops = &mov_ops,  },
 	{ .name = "cmp",	.ops = &mov_ops,  },
 	{ .name = "cmpxch",	.ops = &mov_ops,  },
 	{ .name = "cmpxchg",	.ops = &mov_ops,  },
@@ -73,23 +85,23 @@ static struct ins x86__instructions[] = {
 	{ .name = "movaps",	.ops = &mov_ops,  },
 	{ .name = "movdqa",	.ops = &mov_ops,  },
 	{ .name = "movdqu",	.ops = &mov_ops,  },
+	{ .name = "movsb",	.ops = &mov_ops,  },
 	{ .name = "movsd",	.ops = &mov_ops,  },
+	{ .name = "movsl",	.ops = &mov_ops,  },
 	{ .name = "movss",	.ops = &mov_ops,  },
-	{ .name = "movsb",	.ops = &mov_ops,  },
 	{ .name = "movsw",	.ops = &mov_ops,  },
-	{ .name = "movsl",	.ops = &mov_ops,  },
 	{ .name = "movupd",	.ops = &mov_ops,  },
 	{ .name = "movups",	.ops = &mov_ops,  },
 	{ .name = "movzb",	.ops = &mov_ops,  },
-	{ .name = "movzw",	.ops = &mov_ops,  },
 	{ .name = "movzl",	.ops = &mov_ops,  },
+	{ .name = "movzw",	.ops = &mov_ops,  },
 	{ .name = "mulsd",	.ops = &mov_ops,  },
 	{ .name = "mulss",	.ops = &mov_ops,  },
 	{ .name = "nop",	.ops = &nop_ops,  },
 	{ .name = "or",		.ops = &mov_ops,  },
 	{ .name = "orps",	.ops = &mov_ops,  },
-	{ .name = "pand",	.ops = &mov_ops,  },
 	{ .name = "paddq",	.ops = &mov_ops,  },
+	{ .name = "pand",	.ops = &mov_ops,  },
 	{ .name = "pcmpeqb",	.ops = &mov_ops,  },
 	{ .name = "por",	.ops = &mov_ops,  },
 	{ .name = "rcl",	.ops = &mov_ops,  },
@@ -119,7 +131,7 @@ static struct ins x86__instructions[] = {
 	{ .name = "xorps",	.ops = &mov_ops, },
 };
 
-static bool amd__ins_is_fused(struct arch *arch, const char *ins1,
+static bool amd__ins_is_fused(const struct arch *arch, const char *ins1,
 			      const char *ins2)
 {
 	if (strstr(ins2, "jmp"))
@@ -142,7 +154,7 @@ static bool amd__ins_is_fused(struct arch *arch, const char *ins1,
 	return false;
 }
 
-static bool intel__ins_is_fused(struct arch *arch, const char *ins1,
+static bool intel__ins_is_fused(const struct arch *arch, const char *ins1,
 				const char *ins2)
 {
 	if (arch->family != 6 || arch->model < 0x1e || strstr(ins2, "jmp"))
@@ -170,7 +182,7 @@ static bool intel__ins_is_fused(struct arch *arch, const char *ins1,
 	return false;
 }
 
-static int x86__cpuid_parse(struct arch *arch, char *cpuid)
+static int x86__cpuid_parse(struct arch *arch, const char *cpuid)
 {
 	unsigned int family, model, stepping;
 	int ret;
@@ -191,23 +203,6 @@ static int x86__cpuid_parse(struct arch *arch, char *cpuid)
 	return -1;
 }
 
-static int x86__annotate_init(struct arch *arch, char *cpuid)
-{
-	int err = 0;
-
-	if (arch->initialized)
-		return 0;
-
-	if (cpuid) {
-		if (x86__cpuid_parse(arch, cpuid))
-			err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
-	}
-	arch->e_machine = EM_X86_64;
-	arch->e_flags = 0;
-	arch->initialized = true;
-	return err;
-}
-
 #ifdef HAVE_LIBDW_SUPPORT
 static void update_insn_state_x86(struct type_state *state,
 				  struct data_loc_info *dloc, Dwarf_Die *cu_die,
@@ -781,3 +776,45 @@ retry:
 	/* Case 4. memory to memory transfers (not handled for now) */
 }
 #endif
+
+const struct arch *arch__new_x86(const struct e_machine_and_e_flags *id, const char *cpuid)
+{
+	struct arch *arch = zalloc(sizeof(*arch));
+
+	if (!arch)
+		return NULL;
+
+	arch->name = "x86";
+	arch->id = *id;
+	if (cpuid) {
+		if (x86__cpuid_parse(arch, cpuid)) {
+			errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
+			return NULL;
+		}
+	}
+	arch->instructions = x86__instructions;
+	arch->nr_instructions = ARRAY_SIZE(x86__instructions);
+#ifndef NDEBUG
+	{
+		static bool sorted_check;
+
+		if (!sorted_check) {
+			for (size_t i = 0; i < arch->nr_instructions - 1; i++) {
+				assert(strcmp(arch->instructions[i].name,
+					      arch->instructions[i + 1].name) <= 0);
+			}
+			sorted_check = true;
+		}
+	}
+#endif
+	arch->sorted_instructions = true;
+	arch->objdump.comment_char = '#';
+	arch->objdump.register_char = '%';
+	arch->objdump.memory_ref_char = '(';
+	arch->objdump.imm_char = '$';
+	arch->insn_suffix = "bwlq";
+#ifdef HAVE_LIBDW_SUPPORT
+	arch->update_insn_state = update_insn_state_x86;
+#endif
+	return arch;
+}
diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 07cf9c334be0..44fbd41e3845 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -160,12 +160,12 @@ bool has_reg_type(struct type_state *state, int reg)
 	return (unsigned)reg < ARRAY_SIZE(state->regs);
 }
 
-static void init_type_state(struct type_state *state, struct arch *arch)
+static void init_type_state(struct type_state *state, const struct arch *arch)
 {
 	memset(state, 0, sizeof(*state));
 	INIT_LIST_HEAD(&state->stack_vars);
 
-	if (arch__is(arch, "x86")) {
+	if (arch__is_x86(arch)) {
 		state->regs[0].caller_saved = true;
 		state->regs[1].caller_saved = true;
 		state->regs[2].caller_saved = true;
@@ -526,7 +526,7 @@ static enum type_match_result check_variable(struct data_loc_info *dloc,
 		needs_pointer = false;
 	else if (reg == dloc->fbreg || is_fbreg)
 		needs_pointer = false;
-	else if (arch__is(dloc->arch, "x86") && reg == X86_REG_SP)
+	else if (arch__is_x86(dloc->arch) && reg == X86_REG_SP)
 		needs_pointer = false;
 
 	/* Get the type of the variable */
@@ -1071,7 +1071,7 @@ static void delete_var_types(struct die_var_type *var_types)
 /* should match to is_stack_canary() in util/annotate.c */
 static void setup_stack_canary(struct data_loc_info *dloc)
 {
-	if (arch__is(dloc->arch, "x86")) {
+	if (arch__is_x86(dloc->arch)) {
 		dloc->op->segment = INSN_SEG_X86_GS;
 		dloc->op->imm = true;
 		dloc->op->offset = 40;
@@ -1311,7 +1311,7 @@ check_kernel:
 
 		/* Direct this-cpu access like "%gs:0x34740" */
 		if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm &&
-		    arch__is(dloc->arch, "x86")) {
+		    arch__is_x86(dloc->arch)) {
 			pr_debug_dtp("this-cpu var");
 
 			addr = dloc->op->offset;
@@ -1397,7 +1397,7 @@ out:
 
 static int arch_supports_insn_tracking(struct data_loc_info *dloc)
 {
-	if ((arch__is(dloc->arch, "x86")) || (arch__is(dloc->arch, "powerpc")))
+	if ((arch__is_x86(dloc->arch)) || (arch__is_powerpc(dloc->arch)))
 		return 1;
 	return 0;
 }
diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h
index 869307c7f130..9b222869e42d 100644
--- a/tools/perf/util/annotate-data.h
+++ b/tools/perf/util/annotate-data.h
@@ -117,7 +117,7 @@ extern struct annotated_data_type canary_type;
  */
 struct data_loc_info {
 	/* These are input field, should be filled by caller */
-	struct arch *arch;
+	const struct arch *arch;
 	struct thread *thread;
 	struct map_symbol *ms;
 	u64 ip;
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index cc7764455faf..2e3522905046 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -761,7 +761,7 @@ static int disasm_line__print(struct disasm_line *dl, u64 start, int addr_fmt_wi
 }
 
 static struct annotated_data_type *
-__hist_entry__get_data_type(struct hist_entry *he, struct arch *arch,
+__hist_entry__get_data_type(struct hist_entry *he, const struct arch *arch,
 			    struct debuginfo *dbg, struct disasm_line *dl,
 			    int *type_offset);
 
@@ -980,47 +980,43 @@ void symbol__calc_percent(struct symbol *sym, struct evsel *evsel)
 	annotation__calc_percent(notes, evsel, symbol__size(sym));
 }
 
-int evsel__get_arch(struct evsel *evsel, struct arch **parch)
+int thread__get_arch(struct thread *thread, const struct arch **parch)
 {
-	struct perf_env *env = evsel__env(evsel);
-	const char *arch_name = perf_env__arch(env);
-	struct arch *arch;
-	int err;
+	const struct arch *arch;
+	struct machine *machine;
+	uint32_t e_flags;
+	uint16_t e_machine;
 
-	if (!arch_name) {
+	if (!thread) {
 		*parch = NULL;
-		return errno;
+		return -1;
 	}
 
-	*parch = arch = arch__find(arch_name);
+	machine = maps__machine(thread__maps(thread));
+	e_machine = thread__e_machine(thread, machine, &e_flags);
+	arch = arch__find(e_machine, e_flags, machine->env ? machine->env->cpuid : NULL);
 	if (arch == NULL) {
-		pr_err("%s: unsupported arch %s\n", __func__, arch_name);
-		return ENOTSUP;
+		pr_err("%s: unsupported arch %d\n", __func__, e_machine);
+		return errno;
 	}
+	if (parch)
+		*parch = arch;
 
-	if (arch->init) {
-		err = arch->init(arch, env ? env->cpuid : NULL);
-		if (err) {
-			pr_err("%s: failed to initialize %s arch priv area\n",
-			       __func__, arch->name);
-			return err;
-		}
-	}
 	return 0;
 }
 
 int symbol__annotate(struct map_symbol *ms, struct evsel *evsel,
-		     struct arch **parch)
+		     const struct arch **parch)
 {
 	struct symbol *sym = ms->sym;
 	struct annotation *notes = symbol__annotation(sym);
 	struct annotate_args args = {
 		.options	= &annotate_opts,
 	};
-	struct arch *arch = NULL;
+	const struct arch *arch = NULL;
 	int err, nr;
 
-	err = evsel__get_arch(evsel, &arch);
+	err = thread__get_arch(ms->thread, &arch);
 	if (err)
 		return err;
 
@@ -1031,7 +1027,7 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel,
 		return 0;
 
 	args.arch = arch;
-	args.ms = *ms;
+	args.ms = ms;
 
 	if (notes->src == NULL) {
 		notes->src = annotated_source__new();
@@ -1268,7 +1264,7 @@ int hist_entry__annotate_printf(struct hist_entry *he, struct evsel *evsel)
 
 	apd.addr_fmt_width = annotated_source__addr_fmt_width(&notes->src->source,
 							      notes->src->start);
-	evsel__get_arch(evsel, &apd.arch);
+	thread__get_arch(ms->thread, &apd.arch);
 	apd.dbg = dso__debuginfo(dso);
 
 	list_for_each_entry(pos, &notes->src->source, node) {
@@ -1373,7 +1369,7 @@ static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp,
 	struct annotation_line *al;
 
 	if (annotate_opts.code_with_type) {
-		evsel__get_arch(apd->evsel, &apd->arch);
+		thread__get_arch(apd->he->ms.thread, &apd->arch);
 		apd->dbg = dso__debuginfo(map__dso(apd->he->ms.map));
 	}
 
@@ -2204,7 +2200,7 @@ print_addr:
 }
 
 int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel,
-		      struct arch **parch)
+		      const struct arch **parch)
 {
 	struct symbol *sym = ms->sym;
 	struct annotation *notes = symbol__annotation(sym);
@@ -2451,17 +2447,39 @@ int annotate_check_args(void)
 	return 0;
 }
 
+static int arch__dwarf_regnum(const struct arch *arch, const char *str)
+{
+	const char *p;
+	char *regname, *q;
+	int reg;
+
+	p = strchr(str, arch->objdump.register_char);
+	if (p == NULL)
+		return -1;
+
+	regname = strdup(p);
+	if (regname == NULL)
+		return -1;
+
+	q = strpbrk(regname, ",) ");
+	if (q)
+		*q = '\0';
+
+	reg = get_dwarf_regnum(regname, arch->id.e_machine, arch->id.e_flags);
+	free(regname);
+	return reg;
+}
+
 /*
  * Get register number and access offset from the given instruction.
  * It assumes AT&T x86 asm format like OFFSET(REG).  Maybe it needs
  * to revisit the format when it handles different architecture.
  * Fills @reg and @offset when return 0.
  */
-static int extract_reg_offset(struct arch *arch, const char *str,
+static int extract_reg_offset(const struct arch *arch, const char *str,
 			      struct annotated_op_loc *op_loc)
 {
 	char *p;
-	char *regname;
 
 	if (arch->objdump.register_char == 0)
 		return -1;
@@ -2474,7 +2492,7 @@ static int extract_reg_offset(struct arch *arch, const char *str,
 	 * %gs:0x18(%rbx).  In that case it should skip the part.
 	 */
 	if (*str == arch->objdump.register_char) {
-		if (arch__is(arch, "x86")) {
+		if (arch__is_x86(arch)) {
 			/* FIXME: Handle other segment registers */
 			if (!strncmp(str, "%gs:", 4))
 				op_loc->segment = INSN_SEG_X86_GS;
@@ -2486,31 +2504,14 @@ static int extract_reg_offset(struct arch *arch, const char *str,
 	}
 
 	op_loc->offset = strtol(str, &p, 0);
-
-	p = strchr(p, arch->objdump.register_char);
-	if (p == NULL)
-		return -1;
-
-	regname = strdup(p);
-	if (regname == NULL)
+	op_loc->reg1 = arch__dwarf_regnum(arch, p);
+	if (op_loc->reg1 == -1)
 		return -1;
 
-	op_loc->reg1 = get_dwarf_regnum(regname, arch->e_machine, arch->e_flags);
-	free(regname);
-
 	/* Get the second register */
-	if (op_loc->multi_regs) {
-		p = strchr(p + 1, arch->objdump.register_char);
-		if (p == NULL)
-			return -1;
+	if (op_loc->multi_regs)
+		op_loc->reg2 = arch__dwarf_regnum(arch, p + 1);
 
-		regname = strdup(p);
-		if (regname == NULL)
-			return -1;
-
-		op_loc->reg2 = get_dwarf_regnum(regname, arch->e_machine, arch->e_flags);
-		free(regname);
-	}
 	return 0;
 }
 
@@ -2538,7 +2539,7 @@ static int extract_reg_offset(struct arch *arch, const char *str,
  *                              # dst_reg1 = rbx, dst_reg2 = rcx, dst_mem = 1
  *                              # dst_multi_regs = 1, dst_offset = 8
  */
-int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl,
+int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl,
 			       struct annotated_insn_loc *loc)
 {
 	struct ins_operands *ops;
@@ -2571,7 +2572,7 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl,
 		op_loc->reg2 = -1;
 
 		if (insn_str == NULL) {
-			if (!arch__is(arch, "powerpc"))
+			if (!arch__is_powerpc(arch))
 				continue;
 		}
 
@@ -2580,7 +2581,7 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl,
 		 * required fields for op_loc, ie reg1, reg2, offset from the
 		 * raw instruction.
 		 */
-		if (arch__is(arch, "powerpc")) {
+		if (arch__is_powerpc(arch)) {
 			op_loc->mem_ref = mem_ref;
 			op_loc->multi_regs = multi_regs;
 			get_powerpc_regs(dl->raw.raw_insn, !i, op_loc);
@@ -2589,9 +2590,10 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl,
 			op_loc->multi_regs = multi_regs;
 			extract_reg_offset(arch, insn_str, op_loc);
 		} else {
-			char *s, *p = NULL;
+			const char *s = insn_str;
+			char *p = NULL;
 
-			if (arch__is(arch, "x86")) {
+			if (arch__is_x86(arch)) {
 				/* FIXME: Handle other segment registers */
 				if (!strncmp(insn_str, "%gs:", 4)) {
 					op_loc->segment = INSN_SEG_X86_GS;
@@ -2603,18 +2605,14 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl,
 				}
 			}
 
-			s = strdup(insn_str);
-			if (s == NULL)
-				return -1;
-
-			if (*s == arch->objdump.register_char)
-				op_loc->reg1 = get_dwarf_regnum(s, arch->e_machine, arch->e_flags);
+			if (*s == arch->objdump.register_char) {
+				op_loc->reg1 = arch__dwarf_regnum(arch, s);
+			}
 			else if (*s == arch->objdump.imm_char) {
 				op_loc->offset = strtol(s + 1, &p, 0);
 				if (p && p != s + 1)
 					op_loc->imm = true;
 			}
-			free(s);
 		}
 	}
 
@@ -2673,9 +2671,9 @@ static struct annotated_item_stat *annotate_data_stat(struct list_head *head,
 	return istat;
 }
 
-static bool is_stack_operation(struct arch *arch, struct disasm_line *dl)
+static bool is_stack_operation(const struct arch *arch, struct disasm_line *dl)
 {
-	if (arch__is(arch, "x86")) {
+	if (arch__is_x86(arch)) {
 		if (!strncmp(dl->ins.name, "push", 4) ||
 		    !strncmp(dl->ins.name, "pop", 3) ||
 		    !strncmp(dl->ins.name, "call", 4) ||
@@ -2686,10 +2684,10 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl)
 	return false;
 }
 
-static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc)
+static bool is_stack_canary(const struct arch *arch, struct annotated_op_loc *loc)
 {
 	/* On x86_64, %gs:40 is used for stack canary */
-	if (arch__is(arch, "x86")) {
+	if (arch__is_x86(arch)) {
 		if (loc->segment == INSN_SEG_X86_GS && loc->imm &&
 		    loc->offset == 40)
 			return true;
@@ -2702,9 +2700,9 @@ static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc)
  * Returns true if the instruction has a memory operand without
  * performing a load/store
  */
-static bool is_address_gen_insn(struct arch *arch, struct disasm_line *dl)
+static bool is_address_gen_insn(const struct arch *arch, struct disasm_line *dl)
 {
-	if (arch__is(arch, "x86")) {
+	if (arch__is_x86(arch)) {
 		if (!strncmp(dl->ins.name, "lea", 3))
 			return true;
 	}
@@ -2791,7 +2789,7 @@ void debuginfo_cache__delete(void)
 }
 
 static struct annotated_data_type *
-__hist_entry__get_data_type(struct hist_entry *he, struct arch *arch,
+__hist_entry__get_data_type(struct hist_entry *he, const struct arch *arch,
 			    struct debuginfo *dbg, struct disasm_line *dl,
 			    int *type_offset)
 {
@@ -2847,7 +2845,7 @@ __hist_entry__get_data_type(struct hist_entry *he, struct arch *arch,
 		}
 
 		/* This CPU access in kernel - pretend PC-relative addressing */
-		if (dso__kernel(map__dso(ms->map)) && arch__is(arch, "x86") &&
+		if (dso__kernel(map__dso(ms->map)) && arch__is_x86(arch) &&
 		    op_loc->segment == INSN_SEG_X86_GS && op_loc->imm) {
 			dloc.var_addr = op_loc->offset;
 			op_loc->reg1 = DWARF_REG_PC;
@@ -2895,7 +2893,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he)
 {
 	struct map_symbol *ms = &he->ms;
 	struct evsel *evsel = hists_to_evsel(he->hists);
-	struct arch *arch;
+	const struct arch *arch;
 	struct disasm_line *dl;
 	struct annotated_data_type *mem_type;
 	struct annotated_item_stat *istat;
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index d4990bff29a7..696e36dbf013 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -202,7 +202,7 @@ struct annotation_write_ops {
 struct annotation_print_data {
 	struct hist_entry *he;
 	struct evsel *evsel;
-	struct arch *arch;
+	const struct arch *arch;
 	struct debuginfo *dbg;
 	/* save data type info keyed by al->offset */
 	struct hashmap *type_hash;
@@ -441,10 +441,10 @@ void symbol__annotate_zero_histograms(struct symbol *sym);
 
 int symbol__annotate(struct map_symbol *ms,
 		     struct evsel *evsel,
-		     struct arch **parch);
+		     const struct arch **parch);
 int symbol__annotate2(struct map_symbol *ms,
 		      struct evsel *evsel,
-		      struct arch **parch);
+		      const struct arch **parch);
 
 enum symbol_disassemble_errno {
 	SYMBOL_ANNOTATE_ERRNO__SUCCESS		= 0,
@@ -546,7 +546,7 @@ struct annotated_insn_loc {
 	     i++, op_loc++)
 
 /* Get detailed location info in the instruction */
-int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl,
+int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl,
 			       struct annotated_insn_loc *loc);
 
 /* Returns a data type from the sample instruction (if any) */
@@ -586,5 +586,5 @@ int annotation_br_cntr_entry(char **str, int br_cntr_nr, u64 *br_cntr,
 			     int num_aggr, struct evsel *evsel);
 int annotation_br_cntr_abbr_list(char **str, struct evsel *evsel, bool header);
 
-int evsel__get_arch(struct evsel *evsel, struct arch **parch);
+int thread__get_arch(struct thread *thread, const struct arch **parch);
 #endif	/* __PERF_ANNOTATE_H */
diff --git a/tools/perf/util/archinsn.h b/tools/perf/util/archinsn.h
deleted file mode 100644
index 448cbb6b8d7e..000000000000
--- a/tools/perf/util/archinsn.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef INSN_H
-#define INSN_H 1
-
-struct perf_sample;
-struct machine;
-struct thread;
-
-void arch_fetch_insn(struct perf_sample *sample,
-		     struct thread *thread,
-		     struct machine *machine);
-
-#endif
diff --git a/tools/perf/util/arm64-frame-pointer-unwind-support.c b/tools/perf/util/arm64-frame-pointer-unwind-support.c
index 958afe8b821e..858ce2b01812 100644
--- a/tools/perf/util/arm64-frame-pointer-unwind-support.c
+++ b/tools/perf/util/arm64-frame-pointer-unwind-support.c
@@ -2,7 +2,6 @@
 #include "arm64-frame-pointer-unwind-support.h"
 #include "callchain.h"
 #include "event.h"
-#include "perf_regs.h" // SMPL_REG_MASK
 #include "unwind.h"
 #include <string.h>
 
@@ -15,6 +14,8 @@ struct entries {
 	size_t length;
 };
 
+#define SMPL_REG_MASK(b) (1ULL << (b))
+
 static bool get_leaf_frame_caller_enabled(struct perf_sample *sample)
 {
 	struct regs_dump *regs;
diff --git a/tools/perf/util/blake2s.c b/tools/perf/util/blake2s.c
new file mode 100644
index 000000000000..ce5d89a19376
--- /dev/null
+++ b/tools/perf/util/blake2s.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is an implementation of the BLAKE2s hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ */
+
+#include "blake2s.h"
+#include <linux/kernel.h>
+
+static inline u32 ror32(u32 v, int n)
+{
+	return (v >> n) | (v << (32 - n));
+}
+
+static inline void le32_to_cpu_array(u32 a[], size_t n)
+{
+	for (size_t i = 0; i < n; i++)
+		a[i] = le32_to_cpu((__force __le32)a[i]);
+}
+
+static inline void cpu_to_le32_array(u32 a[], size_t n)
+{
+	for (size_t i = 0; i < n; i++)
+		a[i] = (__force u32)cpu_to_le32(a[i]);
+}
+
+static const u8 blake2s_sigma[10][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+
+static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc)
+{
+	ctx->t[0] += inc;
+	ctx->t[1] += (ctx->t[0] < inc);
+}
+
+static void blake2s_compress(struct blake2s_ctx *ctx,
+			     const u8 *data, size_t nblocks, u32 inc)
+{
+	u32 m[16];
+	u32 v[16];
+	int i;
+
+	while (nblocks > 0) {
+		blake2s_increment_counter(ctx, inc);
+		memcpy(m, data, BLAKE2S_BLOCK_SIZE);
+		le32_to_cpu_array(m, ARRAY_SIZE(m));
+		memcpy(v, ctx->h, 32);
+		v[ 8] = BLAKE2S_IV0;
+		v[ 9] = BLAKE2S_IV1;
+		v[10] = BLAKE2S_IV2;
+		v[11] = BLAKE2S_IV3;
+		v[12] = BLAKE2S_IV4 ^ ctx->t[0];
+		v[13] = BLAKE2S_IV5 ^ ctx->t[1];
+		v[14] = BLAKE2S_IV6 ^ ctx->f[0];
+		v[15] = BLAKE2S_IV7 ^ ctx->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+	a += b + m[blake2s_sigma[r][2 * i + 0]]; \
+	d = ror32(d ^ a, 16); \
+	c += d; \
+	b = ror32(b ^ c, 12); \
+	a += b + m[blake2s_sigma[r][2 * i + 1]]; \
+	d = ror32(d ^ a, 8); \
+	c += d; \
+	b = ror32(b ^ c, 7); \
+} while (0)
+
+#define ROUND(r) do { \
+	G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+	G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+	G(r, 2, v[2], v[ 6], v[10], v[14]); \
+	G(r, 3, v[3], v[ 7], v[11], v[15]); \
+	G(r, 4, v[0], v[ 5], v[10], v[15]); \
+	G(r, 5, v[1], v[ 6], v[11], v[12]); \
+	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+		ROUND(0);
+		ROUND(1);
+		ROUND(2);
+		ROUND(3);
+		ROUND(4);
+		ROUND(5);
+		ROUND(6);
+		ROUND(7);
+		ROUND(8);
+		ROUND(9);
+
+#undef G
+#undef ROUND
+
+		for (i = 0; i < 8; ++i)
+			ctx->h[i] ^= v[i] ^ v[i + 8];
+
+		data += BLAKE2S_BLOCK_SIZE;
+		--nblocks;
+	}
+}
+
+static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx)
+{
+	ctx->f[0] = -1;
+}
+
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen)
+{
+	const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen;
+
+	if (unlikely(!inlen))
+		return;
+	if (inlen > fill) {
+		memcpy(ctx->buf + ctx->buflen, in, fill);
+		blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE);
+		ctx->buflen = 0;
+		in += fill;
+		inlen -= fill;
+	}
+	if (inlen > BLAKE2S_BLOCK_SIZE) {
+		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
+
+		blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+		in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+		inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+	}
+	memcpy(ctx->buf + ctx->buflen, in, inlen);
+	ctx->buflen += inlen;
+}
+
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out)
+{
+	blake2s_set_lastblock(ctx);
+	memset(ctx->buf + ctx->buflen, 0,
+	       BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */
+	blake2s_compress(ctx, ctx->buf, 1, ctx->buflen);
+	cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h));
+	memcpy(out, ctx->h, ctx->outlen);
+	memset(ctx, 0, sizeof(*ctx));
+}
diff --git a/tools/perf/util/blake2s.h b/tools/perf/util/blake2s.h
new file mode 100644
index 000000000000..a1fe81a4bea8
--- /dev/null
+++ b/tools/perf/util/blake2s.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _CRYPTO_BLAKE2S_H
+#define _CRYPTO_BLAKE2S_H
+
+#include <string.h>
+#include <linux/types.h>
+
+#define BLAKE2S_BLOCK_SIZE 64
+
+struct blake2s_ctx {
+	u32 h[8];
+	u32 t[2];
+	u32 f[2];
+	u8 buf[BLAKE2S_BLOCK_SIZE];
+	unsigned int buflen;
+	unsigned int outlen;
+};
+
+enum blake2s_iv {
+	BLAKE2S_IV0 = 0x6A09E667UL,
+	BLAKE2S_IV1 = 0xBB67AE85UL,
+	BLAKE2S_IV2 = 0x3C6EF372UL,
+	BLAKE2S_IV3 = 0xA54FF53AUL,
+	BLAKE2S_IV4 = 0x510E527FUL,
+	BLAKE2S_IV5 = 0x9B05688CUL,
+	BLAKE2S_IV6 = 0x1F83D9ABUL,
+	BLAKE2S_IV7 = 0x5BE0CD19UL,
+};
+
+static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen,
+				  const void *key, size_t keylen)
+{
+	ctx->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen);
+	ctx->h[1] = BLAKE2S_IV1;
+	ctx->h[2] = BLAKE2S_IV2;
+	ctx->h[3] = BLAKE2S_IV3;
+	ctx->h[4] = BLAKE2S_IV4;
+	ctx->h[5] = BLAKE2S_IV5;
+	ctx->h[6] = BLAKE2S_IV6;
+	ctx->h[7] = BLAKE2S_IV7;
+	ctx->t[0] = 0;
+	ctx->t[1] = 0;
+	ctx->f[0] = 0;
+	ctx->f[1] = 0;
+	ctx->buflen = 0;
+	ctx->outlen = outlen;
+	if (keylen) {
+		memcpy(ctx->buf, key, keylen);
+		memset(&ctx->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen);
+		ctx->buflen = BLAKE2S_BLOCK_SIZE;
+	}
+}
+
+static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen)
+{
+	__blake2s_init(ctx, outlen, NULL, 0);
+}
+
+static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen,
+				    const void *key, size_t keylen)
+{
+	__blake2s_init(ctx, outlen, key, keylen);
+}
+
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen);
+
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out);
+
+#endif /* _CRYPTO_BLAKE2S_H */
diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index 2298cd396c42..67e7786bb878 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -733,7 +733,8 @@ kallsyms_process_symbol(void *data, const char *_name,
 			char type __maybe_unused, u64 start)
 {
 	char disp[KSYM_NAME_LEN];
-	char *module, *name;
+	const char *module;
+	char *name;
 	unsigned long id;
 	int err = 0;
 
@@ -787,11 +788,10 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
 				err = 0;
 				break;
 			}
-			pr_debug("%s: can't get next program: %s%s\n",
-				 __func__, strerror(errno),
-				 errno == EINVAL ? " -- kernel too old?" : "");
 			/* don't report error on old kernel or EPERM  */
 			err = (errno == EINVAL || errno == EPERM) ? 0 : -1;
+			pr_debug("%s: can\'t get next program: %m%s\n",
+				__func__, errno == EINVAL ? " -- kernel too old?" : "");
 			break;
 		}
 		fd = bpf_prog_get_fd_by_id(id);
@@ -824,10 +824,8 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
 		.tool    = session->tool,
 	};
 
-	if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol)) {
-		pr_err("%s: failed to synthesize bpf images: %s\n",
-		       __func__, strerror(errno));
-	}
+	if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol))
+		pr_err("%s: failed to synthesize bpf images: %m\n", __func__);
 
 	free(event);
 	return err;
diff --git a/tools/perf/util/bpf-utils.c b/tools/perf/util/bpf-utils.c
index 5a66dc8594aa..d6d2c9c190f7 100644
--- a/tools/perf/util/bpf-utils.c
+++ b/tools/perf/util/bpf-utils.c
@@ -123,7 +123,7 @@ get_bpf_prog_info_linear(int fd, __u64 arrays)
 	/* step 1: get array dimensions */
 	err = bpf_obj_get_info_by_fd(fd, &info, &info_len);
 	if (err) {
-		pr_debug("can't get prog info: %s", strerror(errno));
+		pr_debug("can't get prog info: %m\n");
 		return ERR_PTR(-EFAULT);
 	}
 	if (info.type >= __MAX_BPF_PROG_TYPE)
@@ -186,7 +186,7 @@ get_bpf_prog_info_linear(int fd, __u64 arrays)
 	/* step 5: call syscall again to get required arrays */
 	err = bpf_obj_get_info_by_fd(fd, &info_linear->info, &info_len);
 	if (err) {
-		pr_debug("can't get prog info: %s", strerror(errno));
+		pr_debug("can't get prog info: %m\n");
 		free(info_linear);
 		return ERR_PTR(-EFAULT);
 	}
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 7b5671f13c53..cbd7435579fe 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -42,7 +42,7 @@ static void check_slab_cache_iter(struct lock_contention *con)
 
 	con->btf = btf__load_vmlinux_btf();
 	if (con->btf == NULL) {
-		pr_debug("BTF loading failed: %s\n", strerror(errno));
+		pr_debug("BTF loading failed: %m\n");
 		return;
 	}
 
@@ -117,6 +117,9 @@ static void init_numa_data(struct lock_contention *con)
 	long last = -1;
 	int ret;
 
+	if (!con->btf)
+		return;
+
 	/*
 	 * 'struct zone' is embedded in 'struct pglist_data' as an array.
 	 * As we may not have full information of the struct zone in the
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 428e5350d7a2..8ff0898799ee 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -31,6 +31,7 @@
 #include "callchain.h"
 #include "branch.h"
 #include "symbol.h"
+#include "thread.h"
 #include "util.h"
 #include "../perf.h"
 
@@ -1042,7 +1043,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
 
 	list_for_each_entry_safe(list, next_list, &src->val, list) {
 		struct map_symbol ms = {
-			.maps = maps__get(list->ms.maps),
+			.thread = thread__get(list->ms.thread),
 			.map = map__get(list->ms.map),
 		};
 		callchain_cursor_append(cursor, list->ip, &ms, false, NULL, 0, 0, 0, list->srcline);
@@ -1147,10 +1148,11 @@ int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *samp
 int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *node,
 			bool hide_unresolved)
 {
-	struct machine *machine = node->ms.maps ? maps__machine(node->ms.maps) : NULL;
+	struct machine *machine = NULL;
+
+	if (node->ms.thread)
+		machine = maps__machine(thread__maps(node->ms.thread));
 
-	maps__put(al->maps);
-	al->maps = maps__get(node->ms.maps);
 	map__put(al->map);
 	al->map = map__get(node->ms.map);
 	al->sym = node->ms.sym;
@@ -1163,7 +1165,7 @@ int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *
 		if (al->map == NULL)
 			goto out;
 	}
-	if (maps__equal(al->maps, machine__kernel_maps(machine))) {
+	if (maps__equal(thread__maps(al->thread), machine__kernel_maps(machine))) {
 		if (machine__is_host(machine)) {
 			al->cpumode = PERF_RECORD_MISC_KERNEL;
 			al->level = 'k';
@@ -1679,7 +1681,7 @@ void callchain_cursor_reset(struct callchain_cursor *cursor)
 		map_symbol__exit(&node->ms);
 }
 
-void callchain_param_setup(u64 sample_type, const char *arch)
+void callchain_param_setup(u64 sample_type, uint16_t e_machine)
 {
 	if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain) {
 		if ((sample_type & PERF_SAMPLE_REGS_USER) &&
@@ -1701,7 +1703,7 @@ void callchain_param_setup(u64 sample_type, const char *arch)
 	 * erroneous entries. Always skipping the LR and starting from the FP
 	 * can result in missing entries.
 	 */
-	if (callchain_param.record_mode == CALLCHAIN_FP && !strcmp(arch, "arm64"))
+	if (callchain_param.record_mode == CALLCHAIN_FP && e_machine == EM_AARCH64)
 		dwarf_callchain_users = true;
 }
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 2a52af8c80ac..df54ddb8c0cb 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -303,7 +303,7 @@ int callchain_branch_counts(struct callchain_root *root,
 			    u64 *branch_count, u64 *predicted_count,
 			    u64 *abort_count, u64 *cycles_count);
 
-void callchain_param_setup(u64 sample_type, const char *arch);
+void callchain_param_setup(u64 sample_type, uint16_t e_machine);
 
 bool callchain_cnode_matched(struct callchain_node *base_cnode,
 			     struct callchain_node *pair_cnode);
diff --git a/tools/perf/util/cap.c b/tools/perf/util/cap.c
index 24a0ea7e6d97..ac6d1d9a523d 100644
--- a/tools/perf/util/cap.c
+++ b/tools/perf/util/cap.c
@@ -28,8 +28,7 @@ bool perf_cap__capable(int cap, bool *used_root)
 		    header.version == _LINUX_CAPABILITY_VERSION_1)
 			continue;
 
-		pr_debug2("capget syscall failed (%s - %d) fall back on root check\n",
-			  strerror(errno), errno);
+		pr_debug2("capget syscall failed (%m) fall back on root check\n");
 		*used_root = true;
 		return geteuid() == 0;
 	}
diff --git a/tools/perf/util/capstone.c b/tools/perf/util/capstone.c
index be5fd44b1f9d..25cf6e15ec27 100644
--- a/tools/perf/util/capstone.c
+++ b/tools/perf/util/capstone.c
@@ -11,20 +11,137 @@
 #include "print_insn.h"
 #include "symbol.h"
 #include "thread.h"
+#include <dlfcn.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <inttypes.h>
 #include <string.h>
 
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 #include <capstone/capstone.h>
+
+#ifdef LIBCAPSTONE_DLOPEN
+static void *perf_cs_dll_handle(void)
+{
+	static bool dll_handle_init;
+	static void *dll_handle;
+
+	if (!dll_handle_init) {
+		dll_handle_init = true;
+		dll_handle = dlopen("libcapstone.so", RTLD_LAZY);
+		if (!dll_handle)
+			pr_debug("dlopen failed for libcapstone.so\n");
+	}
+	return dll_handle;
+}
+#endif
+
+static enum cs_err perf_cs_open(enum cs_arch arch, enum cs_mode mode, csh *handle)
+{
+#ifndef LIBCAPSTONE_DLOPEN
+	return cs_open(arch, mode, handle);
+#else
+	static bool fn_init;
+	static enum cs_err (*fn)(enum cs_arch arch, enum cs_mode mode, csh *handle);
+
+	if (!fn_init) {
+		fn = dlsym(perf_cs_dll_handle(), "cs_open");
+		if (!fn)
+			pr_debug("dlsym failed for cs_open\n");
+		fn_init = true;
+	}
+	if (!fn)
+		return CS_ERR_HANDLE;
+	return fn(arch, mode, handle);
 #endif
+}
+
+static enum cs_err perf_cs_option(csh handle, enum cs_opt_type type, size_t value)
+{
+#ifndef LIBCAPSTONE_DLOPEN
+	return cs_option(handle, type, value);
+#else
+	static bool fn_init;
+	static enum cs_err (*fn)(csh handle, enum cs_opt_type type, size_t value);
+
+	if (!fn_init) {
+		fn = dlsym(perf_cs_dll_handle(), "cs_option");
+		if (!fn)
+			pr_debug("dlsym failed for cs_option\n");
+		fn_init = true;
+	}
+	if (!fn)
+		return CS_ERR_HANDLE;
+	return fn(handle, type, value);
+#endif
+}
+
+static size_t perf_cs_disasm(csh handle, const uint8_t *code, size_t code_size,
+			uint64_t address, size_t count, struct cs_insn **insn)
+{
+#ifndef LIBCAPSTONE_DLOPEN
+	return cs_disasm(handle, code, code_size, address, count, insn);
+#else
+	static bool fn_init;
+	static enum cs_err (*fn)(csh handle, const uint8_t *code, size_t code_size,
+				 uint64_t address, size_t count, struct cs_insn **insn);
+
+	if (!fn_init) {
+		fn = dlsym(perf_cs_dll_handle(), "cs_disasm");
+		if (!fn)
+			pr_debug("dlsym failed for cs_disasm\n");
+		fn_init = true;
+	}
+	if (!fn)
+		return CS_ERR_HANDLE;
+	return fn(handle, code, code_size, address, count, insn);
+#endif
+}
+
+static void perf_cs_free(struct cs_insn *insn, size_t count)
+{
+#ifndef LIBCAPSTONE_DLOPEN
+	cs_free(insn, count);
+#else
+	static bool fn_init;
+	static void (*fn)(struct cs_insn *insn, size_t count);
+
+	if (!fn_init) {
+		fn = dlsym(perf_cs_dll_handle(), "cs_free");
+		if (!fn)
+			pr_debug("dlsym failed for cs_free\n");
+		fn_init = true;
+	}
+	if (!fn)
+		return;
+	fn(insn, count);
+#endif
+}
+
+static enum cs_err perf_cs_close(csh *handle)
+{
+#ifndef LIBCAPSTONE_DLOPEN
+	return cs_close(handle);
+#else
+	static bool fn_init;
+	static enum cs_err (*fn)(csh *handle);
+
+	if (!fn_init) {
+		fn = dlsym(perf_cs_dll_handle(), "cs_close");
+		if (!fn)
+			pr_debug("dlsym failed for cs_close\n");
+		fn_init = true;
+	}
+	if (!fn)
+		return CS_ERR_HANDLE;
+	return fn(handle);
+#endif
+}
 
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 static int capstone_init(struct machine *machine, csh *cs_handle, bool is64,
 			 bool disassembler_style)
 {
-	cs_arch arch;
-	cs_mode mode;
+	enum cs_arch arch;
+	enum cs_mode mode;
 
 	if (machine__is(machine, "x86_64") && is64) {
 		arch = CS_ARCH_X86;
@@ -45,7 +162,7 @@ static int capstone_init(struct machine *machine, csh *cs_handle, bool is64,
 		return -1;
 	}
 
-	if (cs_open(arch, mode, cs_handle) != CS_ERR_OK) {
+	if (perf_cs_open(arch, mode, cs_handle) != CS_ERR_OK) {
 		pr_warning_once("cs_open failed\n");
 		return -1;
 	}
@@ -57,27 +174,25 @@ static int capstone_init(struct machine *machine, csh *cs_handle, bool is64,
 		 * is set via annotation args
 		 */
 		if (disassembler_style)
-			cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT);
+			perf_cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT);
 		/*
 		 * Resolving address operands to symbols is implemented
 		 * on x86 by investigating instruction details.
 		 */
-		cs_option(*cs_handle, CS_OPT_DETAIL, CS_OPT_ON);
+		perf_cs_option(*cs_handle, CS_OPT_DETAIL, CS_OPT_ON);
 	}
 
 	return 0;
 }
-#endif
 
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
-static size_t print_insn_x86(struct thread *thread, u8 cpumode, cs_insn *insn,
+static size_t print_insn_x86(struct thread *thread, u8 cpumode, struct cs_insn *insn,
 			     int print_opts, FILE *fp)
 {
 	struct addr_location al;
 	size_t printed = 0;
 
 	if (insn->detail && insn->detail->x86.op_count == 1) {
-		cs_x86_op *op = &insn->detail->x86.operands[0];
+		struct cs_x86_op *op = &insn->detail->x86.operands[0];
 
 		addr_location__init(&al);
 		if (op->type == X86_OP_IMM &&
@@ -95,7 +210,6 @@ static size_t print_insn_x86(struct thread *thread, u8 cpumode, cs_insn *insn,
 	printed += fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str);
 	return printed;
 }
-#endif
 
 
 ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused,
@@ -106,9 +220,8 @@ ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused,
 				   uint64_t ip __maybe_unused, int *lenp __maybe_unused,
 				   int print_opts __maybe_unused, FILE *fp __maybe_unused)
 {
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 	size_t printed;
-	cs_insn *insn;
+	struct cs_insn *insn;
 	csh cs_handle;
 	size_t count;
 	int ret;
@@ -118,7 +231,7 @@ ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused,
 	if (ret < 0)
 		return ret;
 
-	count = cs_disasm(cs_handle, code, code_size, ip, 1, &insn);
+	count = perf_cs_disasm(cs_handle, code, code_size, ip, 1, &insn);
 	if (count > 0) {
 		if (machine__normalized_is(machine, "x86"))
 			printed = print_insn_x86(thread, cpumode, &insn[0], print_opts, fp);
@@ -126,35 +239,31 @@ ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused,
 			printed = fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str);
 		if (lenp)
 			*lenp = insn->size;
-		cs_free(insn, count);
+		perf_cs_free(insn, count);
 	} else {
 		printed = -1;
 	}
 
-	cs_close(&cs_handle);
+	perf_cs_close(&cs_handle);
 	return printed;
-#else
-	return -1;
-#endif
 }
 
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
-static void print_capstone_detail(cs_insn *insn, char *buf, size_t len,
+static void print_capstone_detail(struct cs_insn *insn, char *buf, size_t len,
 				  struct annotate_args *args, u64 addr)
 {
 	int i;
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct symbol *sym;
 
 	/* TODO: support more architectures */
-	if (!arch__is(args->arch, "x86"))
+	if (!arch__is_x86(args->arch))
 		return;
 
 	if (insn->detail == NULL)
 		return;
 
 	for (i = 0; i < insn->detail->x86.op_count; i++) {
-		cs_x86_op *op = &insn->detail->x86.operands[i];
+		struct cs_x86_op *op = &insn->detail->x86.operands[i];
 		u64 orig_addr;
 
 		if (op->type != X86_OP_MEM)
@@ -195,9 +304,7 @@ static void print_capstone_detail(cs_insn *insn, char *buf, size_t len,
 		break;
 	}
 }
-#endif
 
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 struct find_file_offset_data {
 	u64 ip;
 	u64 offset;
@@ -214,15 +321,13 @@ static int find_file_offset(u64 start, u64 len, u64 pgoff, void *arg)
 	}
 	return 0;
 }
-#endif
 
 int symbol__disassemble_capstone(const char *filename __maybe_unused,
 				 struct symbol *sym __maybe_unused,
 				 struct annotate_args *args __maybe_unused)
 {
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 	struct annotation *notes = symbol__annotation(sym);
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	u64 start = map__rip_2objdump(map, sym->start);
 	u64 offset;
@@ -235,7 +340,7 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 	const u8 *buf;
 	u64 buf_len;
 	csh handle;
-	cs_insn *insn = NULL;
+	struct cs_insn *insn = NULL;
 	char disasm_buf[512];
 	struct disasm_line *dl;
 	bool disassembler_style = false;
@@ -256,7 +361,7 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 	args->line = disasm_buf;
 	args->line_nr = 0;
 	args->fileloc = NULL;
-	args->ms.sym = sym;
+	args->ms->sym = sym;
 
 	dl = disasm_line__new(args);
 	if (dl == NULL)
@@ -268,12 +373,13 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 	    !strcmp(args->options->disassembler_style, "att"))
 		disassembler_style = true;
 
-	if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0)
+	if (capstone_init(maps__machine(thread__maps(args->ms->thread)), &handle, is_64bit,
+			  disassembler_style) < 0)
 		goto err;
 
 	needs_cs_close = true;
 
-	free_count = count = cs_disasm(handle, buf, buf_len, start, buf_len, &insn);
+	free_count = count = perf_cs_disasm(handle, buf, buf_len, start, buf_len, &insn);
 	for (i = 0, offset = 0; i < count; i++) {
 		int printed;
 
@@ -312,9 +418,9 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 
 out:
 	if (needs_cs_close) {
-		cs_close(&handle);
+		perf_cs_close(&handle);
 		if (free_count > 0)
-			cs_free(insn, free_count);
+			perf_cs_free(insn, free_count);
 	}
 	free(code_buf);
 	return count < 0 ? count : 0;
@@ -334,18 +440,14 @@ err:
 	}
 	count = -1;
 	goto out;
-#else
-	return -1;
-#endif
 }
 
 int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
 					 struct symbol *sym __maybe_unused,
 					 struct annotate_args *args __maybe_unused)
 {
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 	struct annotation *notes = symbol__annotation(sym);
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	struct nscookie nsc;
 	u64 start = map__rip_2objdump(map, sym->start);
@@ -382,7 +484,8 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
 	    !strcmp(args->options->disassembler_style, "att"))
 		disassembler_style = true;
 
-	if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0)
+	if (capstone_init(maps__machine(thread__maps(args->ms->thread)), &handle, is_64bit,
+			  disassembler_style) < 0)
 		goto err;
 
 	needs_cs_close = true;
@@ -408,7 +511,7 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
 	args->line = disasm_buf;
 	args->line_nr = 0;
 	args->fileloc = NULL;
-	args->ms.sym = sym;
+	args->ms->sym = sym;
 
 	dl = disasm_line__new(args);
 	if (dl == NULL)
@@ -456,7 +559,7 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
 
 out:
 	if (needs_cs_close)
-		cs_close(&handle);
+		perf_cs_close(&handle);
 	free(buf);
 	return count < 0 ? count : 0;
 
@@ -465,7 +568,4 @@ err:
 		close(fd);
 	count = -1;
 	goto out;
-#else
-	return -1;
-#endif
 }
diff --git a/tools/perf/util/capstone.h b/tools/perf/util/capstone.h
index 0f030ea034b6..7c0baaa01a73 100644
--- a/tools/perf/util/capstone.h
+++ b/tools/perf/util/capstone.h
@@ -6,6 +6,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <linux/compiler.h>
 #include <linux/types.h>
 
 struct annotate_args;
@@ -13,6 +14,7 @@ struct machine;
 struct symbol;
 struct thread;
 
+#ifdef HAVE_LIBCAPSTONE_SUPPORT
 ssize_t capstone__fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpumode,
 				   bool is64bit, const uint8_t *code, size_t code_size,
 				   uint64_t ip, int *lenp, int print_opts, FILE *fp);
@@ -21,4 +23,35 @@ int symbol__disassemble_capstone(const char *filename, struct symbol *sym,
 int symbol__disassemble_capstone_powerpc(const char *filename, struct symbol *sym,
 					 struct annotate_args *args);
 
+#else /* !HAVE_LIBCAPSTONE_SUPPORT */
+static inline ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused,
+						 struct thread *thread __maybe_unused,
+						 u8 cpumode __maybe_unused,
+						 bool is64bit __maybe_unused,
+						 const uint8_t *code __maybe_unused,
+						 size_t code_size __maybe_unused,
+						 uint64_t ip __maybe_unused,
+						 int *lenp __maybe_unused,
+						 int print_opts __maybe_unused,
+						 FILE *fp __maybe_unused)
+{
+	return -1;
+}
+
+static inline int symbol__disassemble_capstone(const char *filename __maybe_unused,
+					       struct symbol *sym __maybe_unused,
+					       struct annotate_args *args __maybe_unused)
+{
+	return -1;
+}
+
+static inline int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
+						       struct symbol *sym __maybe_unused,
+						       struct annotate_args *args __maybe_unused)
+{
+	return -1;
+}
+
+#endif /* HAVE_LIBCAPSTONE_SUPPORT */
+
 #endif /* __PERF_CAPSTONE_H */
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index e0219bc6330a..0452fbc6c085 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -20,6 +20,7 @@
 #include "util/stat.h"  /* perf_stat__set_big_num */
 #include "util/evsel.h"  /* evsel__hw_names, evsel__use_bpf_counters */
 #include "util/addr2line.h"  /* addr2line_timeout_ms */
+#include "srcline.h"
 #include "build-id.h"
 #include "debug.h"
 #include "config.h"
@@ -519,6 +520,9 @@ int perf_default_config(const char *var, const char *value,
 	if (strstarts(var, "stat."))
 		return perf_stat_config(var, value);
 
+	if (strstarts(var, "addr2line."))
+		return addr2line_configure(var, value, dummy);
+
 	/* Add other config variables here. */
 	return 0;
 }
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 25d56e0f1c07..95f439c96180 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -6,7 +6,6 @@
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
  */
 
-#include <linux/kernel.h>
 #include <linux/bitfield.h>
 #include <linux/bitops.h>
 #include <linux/coresight-pmu.h>
@@ -3086,7 +3085,7 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o
 
 	if (aux_offset >= auxtrace_event->offset &&
 	    aux_offset + aux_size <= auxtrace_event->offset + auxtrace_event->size) {
-		struct cs_etm_queue *etmq = etm->queues.queue_array[auxtrace_event->idx].priv;
+		struct cs_etm_queue *etmq = cs_etm__get_queue(etm, auxtrace_event->cpu);
 
 		/*
 		 * If this AUX event was inside this buffer somewhere, create a new auxtrace event
@@ -3095,6 +3094,7 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o
 		auxtrace_fragment.auxtrace = *auxtrace_event;
 		auxtrace_fragment.auxtrace.size = aux_size;
 		auxtrace_fragment.auxtrace.offset = aux_offset;
+		auxtrace_fragment.auxtrace.idx = etmq->queue_nr;
 		file_offset += aux_offset - auxtrace_event->offset + auxtrace_event->header.size;
 
 		pr_debug3("CS ETM: Queue buffer size: %#"PRI_lx64" offset: %#"PRI_lx64
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 3d2e437e1354..ba1c8e48d495 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -34,6 +34,8 @@
 #include "util.h"
 #include "clockid.h"
 #include "util/sample.h"
+#include "util/time-utils.h"
+#include "header.h"
 
 #ifdef HAVE_LIBTRACEEVENT
 #include <event-parse.h>
@@ -91,9 +93,14 @@ struct convert {
 	struct perf_tool	tool;
 	struct ctf_writer	writer;
 
+	struct perf_time_interval *ptime_range;
+	int range_size;
+	int range_num;
+
 	u64			events_size;
 	u64			events_count;
 	u64			non_sample_count;
+	u64			skipped;
 
 	/* Ordered events configured queue size. */
 	u64			queue_size;
@@ -811,6 +818,11 @@ static int process_sample_event(const struct perf_tool *tool,
 	if (WARN_ONCE(!priv, "Failed to setup all events.\n"))
 		return 0;
 
+	if (perf_time__ranges_skip_sample(c->ptime_range, c->range_num, sample->time)) {
+		++c->skipped;
+		return 0;
+	}
+
 	event_class = priv->event_class;
 
 	/* update stats */
@@ -1327,7 +1339,8 @@ static void cleanup_events(struct perf_session *session)
 		struct evsel_priv *priv;
 
 		priv = evsel->priv;
-		bt_ctf_event_class_put(priv->event_class);
+		if (priv)
+			bt_ctf_event_class_put(priv->event_class);
 		zfree(&evsel->priv);
 	}
 
@@ -1376,7 +1389,7 @@ static int ctf_writer__setup_env(struct ctf_writer *cw,
 
 #define ADD(__n, __v)							\
 do {									\
-	if (bt_ctf_writer_add_environment_field(writer, __n, __v))	\
+	if (__v && bt_ctf_writer_add_environment_field(writer, __n, __v))	\
 		return -1;						\
 } while (0)
 
@@ -1392,6 +1405,52 @@ do {									\
 	return 0;
 }
 
+static int process_feature_event(const struct perf_tool *tool,
+				 struct perf_session *session,
+				 union perf_event *event)
+{
+	struct convert *c = container_of(tool, struct convert, tool);
+	struct ctf_writer *cw = &c->writer;
+	struct perf_record_header_feature *fe = &event->feat;
+
+	if (event->feat.feat_id < HEADER_LAST_FEATURE) {
+		int ret = perf_event__process_feature(session, event);
+
+		if (ret)
+			return ret;
+	}
+
+	switch (fe->feat_id) {
+	case HEADER_HOSTNAME:
+		if (session->header.env.hostname) {
+			return bt_ctf_writer_add_environment_field(cw->writer, "host",
+								   session->header.env.hostname);
+		}
+		break;
+	case HEADER_OSRELEASE:
+		if (session->header.env.os_release) {
+			return bt_ctf_writer_add_environment_field(cw->writer, "release",
+								   session->header.env.os_release);
+		}
+		break;
+	case HEADER_VERSION:
+		if (session->header.env.version) {
+			return bt_ctf_writer_add_environment_field(cw->writer, "version",
+								   session->header.env.version);
+		}
+		break;
+	case HEADER_ARCH:
+		if (session->header.env.arch) {
+			return bt_ctf_writer_add_environment_field(cw->writer, "machine",
+								   session->header.env.arch);
+		}
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
 static int ctf_writer__setup_clock(struct ctf_writer *cw,
 				   struct perf_session *session,
 				   bool tod)
@@ -1624,6 +1683,8 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 	c.tool.tracing_data    = perf_event__process_tracing_data;
 	c.tool.build_id        = perf_event__process_build_id;
 	c.tool.namespaces      = perf_event__process_namespaces;
+	c.tool.attr            = perf_event__process_attr;
+	c.tool.feature         = process_feature_event;
 	c.tool.ordering_requires_timestamps = true;
 
 	if (opts->all) {
@@ -1644,6 +1705,15 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 	if (IS_ERR(session))
 		return PTR_ERR(session);
 
+	if (opts->time_str) {
+		err = perf_time__parse_for_ranges(opts->time_str, session,
+						  &c.ptime_range,
+						  &c.range_size,
+						  &c.range_num);
+		if (err < 0)
+			goto free_session;
+	}
+
 	/* CTF writer */
 	if (ctf_writer__init(cw, path, session, opts->tod))
 		goto free_session;
@@ -1673,12 +1743,10 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 	else
 		pr_err("Error during conversion.\n");
 
-	fprintf(stderr,
-		"[ perf data convert: Converted '%s' into CTF data '%s' ]\n",
+	fprintf(stderr,	"[ perf data convert: Converted '%s' into CTF data '%s' ]\n",
 		data.path, path);
 
-	fprintf(stderr,
-		"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples",
+	fprintf(stderr,	"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples",
 		(double) c.events_size / 1024.0 / 1024.0,
 		c.events_count);
 
@@ -1687,6 +1755,14 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 	else
 		fprintf(stderr, ", %" PRIu64 " non-samples) ]\n", c.non_sample_count);
 
+	if (c.skipped) {
+		fprintf(stderr,	"[ perf data convert: Skipped %" PRIu64 " samples ]\n",
+			c.skipped);
+	}
+
+	if (c.ptime_range)
+		zfree(&c.ptime_range);
+
 	cleanup_events(session);
 	perf_session__delete(session);
 	ctf_writer__cleanup(cw);
@@ -1696,6 +1772,9 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 free_writer:
 	ctf_writer__cleanup(cw);
 free_session:
+	if (c.ptime_range)
+		zfree(&c.ptime_range);
+
 	perf_session__delete(session);
 	pr_err("Error during conversion setup.\n");
 	return err;
diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
index 9dc1e184cf3c..6a626322476a 100644
--- a/tools/perf/util/data-convert-json.c
+++ b/tools/perf/util/data-convert-json.c
@@ -25,6 +25,7 @@
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/thread.h"
+#include "util/time-utils.h"
 #include "util/tool.h"
 
 #ifdef HAVE_LIBTRACEEVENT
@@ -35,13 +36,21 @@ struct convert_json {
 	struct perf_tool tool;
 	FILE *out;
 	bool first;
+	struct perf_time_interval *ptime_range;
+	int range_size;
+	int range_num;
+
 	u64 events_count;
+	u64 skipped;
 };
 
 // Outputs a JSON-encoded string surrounded by quotes with characters escaped.
 static void output_json_string(FILE *out, const char *s)
 {
 	fputc('"', out);
+	if (!s)
+		goto out;
+
 	while (*s) {
 		switch (*s) {
 
@@ -65,6 +74,7 @@ static void output_json_string(FILE *out, const char *s)
 
 		++s;
 	}
+out:
 	fputc('"', out);
 }
 
@@ -165,6 +175,11 @@ static int process_sample_event(const struct perf_tool *tool,
 		return -1;
 	}
 
+	if (perf_time__ranges_skip_sample(c->ptime_range, c->range_num, sample->time)) {
+		++c->skipped;
+		return 0;
+	}
+
 	++c->events_count;
 
 	if (c->first)
@@ -311,6 +326,16 @@ static void output_headers(struct perf_session *session, struct convert_json *c)
 	output_json_format(out, false, 2, "]");
 }
 
+static int process_feature_event(const struct perf_tool *tool __maybe_unused,
+				 struct perf_session *session,
+				 union perf_event *event)
+{
+	if (event->feat.feat_id < HEADER_LAST_FEATURE)
+		return perf_event__process_feature(session, event);
+
+	return 0;
+}
+
 int bt_convert__perf2json(const char *input_name, const char *output_name,
 		struct perf_data_convert_opts *opts __maybe_unused)
 {
@@ -320,6 +345,10 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 	struct convert_json c = {
 		.first = true,
 		.events_count = 0,
+		.ptime_range = NULL,
+		.range_size = 0,
+		.range_num = 0,
+		.skipped = 0,
 	};
 	struct perf_data data = {
 		.mode = PERF_DATA_MODE_READ,
@@ -345,6 +374,8 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 	c.tool.auxtrace_info  = perf_event__process_auxtrace_info;
 	c.tool.auxtrace       = perf_event__process_auxtrace;
 	c.tool.event_update   = perf_event__process_event_update;
+	c.tool.attr           = perf_event__process_attr;
+	c.tool.feature        = process_feature_event;
 	c.tool.ordering_requires_timestamps = true;
 
 	if (opts->all) {
@@ -382,6 +413,15 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 		goto err_session_delete;
 	}
 
+	if (opts->time_str) {
+		ret = perf_time__parse_for_ranges(opts->time_str, session,
+						  &c.ptime_range,
+						  &c.range_size,
+						  &c.range_num);
+		if (ret < 0)
+			goto err_session_delete;
+	}
+
 	// The opening brace is printed manually because it isn't delimited from a
 	// previous value (i.e. we don't want a leading newline)
 	fputc('{', c.out);
@@ -403,15 +443,23 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 	output_json_format(c.out, false, 0, "}");
 	fputc('\n', c.out);
 
-	fprintf(stderr,
-			"[ perf data convert: Converted '%s' into JSON data '%s' ]\n",
-			data.path, output_name);
+	fprintf(stderr,	"[ perf data convert: Converted '%s' into JSON data '%s' ]\n",
+		data.path, output_name);
 
 	fprintf(stderr,
-			"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n",
-			(ftell(c.out)) / 1024.0 / 1024.0, c.events_count);
+		"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n",
+		(ftell(c.out)) / 1024.0 / 1024.0, c.events_count);
+
+	if (c.skipped) {
+		fprintf(stderr,	"[ perf data convert: Skipped %" PRIu64 " samples ]\n",
+			c.skipped);
+	}
 
 	ret = 0;
+
+	if (c.ptime_range)
+		zfree(&c.ptime_range);
+
 err_session_delete:
 	perf_session__delete(session);
 err_fclose:
diff --git a/tools/perf/util/data-convert.h b/tools/perf/util/data-convert.h
index 1b4c5f598415..ee651fa680a1 100644
--- a/tools/perf/util/data-convert.h
+++ b/tools/perf/util/data-convert.h
@@ -8,6 +8,7 @@ struct perf_data_convert_opts {
 	bool force;
 	bool all;
 	bool tod;
+	const char *time_str;
 };
 
 #ifdef HAVE_LIBBABELTRACE_SUPPORT
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index 164eb45a0b36..90df41da1a32 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -213,17 +213,15 @@ static int check_backup(struct perf_data *data)
 
 		ret = rm_rf_perf_data(oldname);
 		if (ret) {
-			pr_err("Can't remove old data: %s (%s)\n",
-			       ret == -2 ?
-			       "Unknown file found" : strerror(errno),
-			       oldname);
+			if (ret == -2)
+				pr_err("Can't remove old data: Unknown file found (%s)\n", oldname);
+			else
+				pr_err("Can't remove old data: %m (%s)\n", oldname);
 			return -1;
 		}
 
 		if (rename(data->path, oldname)) {
-			pr_err("Can't move data: %s (%s to %s)\n",
-			       strerror(errno),
-			       data->path, oldname);
+			pr_err("Can't move data: %m (%s to %s)\n", data->path, oldname);
 			return -1;
 		}
 	}
@@ -246,14 +244,12 @@ static int open_file_read(struct perf_data *data)
 	int flags = data->in_place_update ? O_RDWR : O_RDONLY;
 	struct stat st;
 	int fd;
-	char sbuf[STRERR_BUFSIZE];
 
 	fd = open(data->file.path, flags);
 	if (fd < 0) {
 		int err = errno;
 
-		pr_err("failed to open %s: %s", data->file.path,
-			str_error_r(err, sbuf, sizeof(sbuf)));
+		pr_err("failed to open %s: %m", data->file.path);
 		if (err == ENOENT && !strcmp(data->file.path, "perf.data"))
 			pr_err("  (try 'perf record' first)");
 		pr_err("\n");
@@ -285,15 +281,10 @@ static int open_file_read(struct perf_data *data)
 
 static int open_file_write(struct perf_data *data)
 {
-	int fd;
-	char sbuf[STRERR_BUFSIZE];
-
-	fd = open(data->file.path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC,
-		  S_IRUSR|S_IWUSR);
+	int fd = open(data->file.path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, S_IRUSR|S_IWUSR);
 
 	if (fd < 0)
-		pr_err("failed to open %s : %s\n", data->file.path,
-			str_error_r(errno, sbuf, sizeof(sbuf)));
+		pr_err("failed to open %s : %m\n", data->file.path);
 
 	return fd;
 }
@@ -436,8 +427,8 @@ int perf_data__switch(struct perf_data *data,
 
 		if (lseek(data->file.fd, pos, SEEK_SET) == (off_t)-1) {
 			ret = -errno;
-			pr_debug("Failed to lseek to %zu: %s",
-				 pos, strerror(errno));
+			pr_debug("Failed to lseek to %zu: %m\n",
+				 pos);
 			goto out;
 		}
 	}
diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c
index 8f52e8cefcf3..ae9a9065aab7 100644
--- a/tools/perf/util/db-export.c
+++ b/tools/perf/util/db-export.c
@@ -254,7 +254,6 @@ static struct call_path *call_path_from_sample(struct db_export *dbe,
 		addr_location__init(&al);
 		al.sym = node->ms.sym;
 		al.map = map__get(node->ms.map);
-		al.maps = maps__get(thread__maps(thread));
 		al.addr = node->ip;
 		al.thread = thread__get(thread);
 
diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c
index ddf33d58bcd3..c3cb327ed562 100644
--- a/tools/perf/util/demangle-java.c
+++ b/tools/perf/util/demangle-java.c
@@ -158,7 +158,7 @@ char *
 java_demangle_sym(const char *str, int flags)
 {
 	char *buf, *ptr;
-	char *p;
+	const char *p;
 	size_t len, l1 = 0;
 
 	if (!str)
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 50b9433f3f8e..ddcc488f2e5f 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -1,5 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <ctype.h>
+#include <elf.h>
+#ifndef EF_CSKY_ABIMASK
+#define EF_CSKY_ABIMASK	0XF0000000
+#endif
 #include <errno.h>
 #include <fcntl.h>
 #include <inttypes.h>
@@ -28,25 +32,21 @@
 #include "namespaces.h"
 #include "srcline.h"
 #include "symbol.h"
+#include "thread.h"
 #include "util.h"
 
 static regex_t	 file_lineno;
 
 /* These can be referred from the arch-dependent code */
-static struct ins_ops call_ops;
-static struct ins_ops dec_ops;
-static struct ins_ops jump_ops;
-static struct ins_ops mov_ops;
-static struct ins_ops nop_ops;
-static struct ins_ops lock_ops;
-static struct ins_ops ret_ops;
-static struct ins_ops load_store_ops;
-static struct ins_ops arithmetic_ops;
-
-static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
-			   struct ins_operands *ops, int max_ins_name);
-static int call__scnprintf(struct ins *ins, char *bf, size_t size,
-			   struct ins_operands *ops, int max_ins_name);
+const struct ins_ops call_ops;
+const struct ins_ops dec_ops;
+const struct ins_ops jump_ops;
+const struct ins_ops mov_ops;
+const struct ins_ops nop_ops;
+const struct ins_ops lock_ops;
+const struct ins_ops ret_ops;
+const struct ins_ops load_store_ops;
+const struct ins_ops arithmetic_ops;
 
 static void ins__sort(struct arch *arch);
 static int disasm_line__parse(char *line, const char **namep, char **rawp);
@@ -66,7 +66,8 @@ static int arch__grow_instructions(struct arch *arch)
 		goto grow_from_non_allocated_table;
 
 	new_nr_allocated = arch->nr_instructions_allocated + 128;
-	new_instructions = realloc(arch->instructions, new_nr_allocated * sizeof(struct ins));
+	new_instructions = realloc((void *)arch->instructions,
+				   new_nr_allocated * sizeof(struct ins));
 	if (new_instructions == NULL)
 		return -1;
 
@@ -81,11 +82,11 @@ grow_from_non_allocated_table:
 	if (new_instructions == NULL)
 		return -1;
 
-	memcpy(new_instructions, arch->instructions, arch->nr_instructions);
+	memcpy(new_instructions, arch->instructions, arch->nr_instructions * sizeof(struct ins));
 	goto out_update_instructions;
 }
 
-static int arch__associate_ins_ops(struct arch* arch, const char *name, struct ins_ops *ops)
+int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops)
 {
 	struct ins *ins;
 
@@ -93,7 +94,7 @@ static int arch__associate_ins_ops(struct arch* arch, const char *name, struct i
 	    arch__grow_instructions(arch))
 		return -1;
 
-	ins = &arch->instructions[arch->nr_instructions];
+	ins = (struct ins *)&arch->instructions[arch->nr_instructions];
 	ins->name = strdup(name);
 	if (!ins->name)
 		return -1;
@@ -105,130 +106,100 @@ static int arch__associate_ins_ops(struct arch* arch, const char *name, struct i
 	return 0;
 }
 
-#include "arch/arc/annotate/instructions.c"
-#include "arch/arm/annotate/instructions.c"
-#include "arch/arm64/annotate/instructions.c"
-#include "arch/csky/annotate/instructions.c"
-#include "arch/loongarch/annotate/instructions.c"
-#include "arch/mips/annotate/instructions.c"
-#include "arch/x86/annotate/instructions.c"
-#include "arch/powerpc/annotate/instructions.c"
-#include "arch/riscv64/annotate/instructions.c"
-#include "arch/s390/annotate/instructions.c"
-#include "arch/sparc/annotate/instructions.c"
-
-static struct arch architectures[] = {
-	{
-		.name = "arc",
-		.init = arc__annotate_init,
-	},
-	{
-		.name = "arm",
-		.init = arm__annotate_init,
-	},
-	{
-		.name = "arm64",
-		.init = arm64__annotate_init,
-	},
-	{
-		.name = "csky",
-		.init = csky__annotate_init,
-	},
-	{
-		.name = "mips",
-		.init = mips__annotate_init,
-		.objdump = {
-			.comment_char = '#',
-		},
-	},
-	{
-		.name = "x86",
-		.init = x86__annotate_init,
-		.instructions = x86__instructions,
-		.nr_instructions = ARRAY_SIZE(x86__instructions),
-		.insn_suffix = "bwlq",
-		.objdump =  {
-			.comment_char = '#',
-			.register_char = '%',
-			.memory_ref_char = '(',
-			.imm_char = '$',
-		},
-#ifdef HAVE_LIBDW_SUPPORT
-		.update_insn_state = update_insn_state_x86,
-#endif
-	},
-	{
-		.name = "powerpc",
-		.init = powerpc__annotate_init,
-#ifdef HAVE_LIBDW_SUPPORT
-		.update_insn_state = update_insn_state_powerpc,
-#endif
-	},
-	{
-		.name = "riscv64",
-		.init = riscv64__annotate_init,
-	},
-	{
-		.name = "s390",
-		.init = s390__annotate_init,
-		.objdump =  {
-			.comment_char = '#',
-		},
-	},
-	{
-		.name = "sparc",
-		.init = sparc__annotate_init,
-		.objdump = {
-			.comment_char = '#',
-		},
-	},
-	{
-		.name = "loongarch",
-		.init = loongarch__annotate_init,
-		.objdump = {
-			.comment_char = '#',
-		},
-	},
-};
+static int e_machine_and_eflags__cmp(const struct e_machine_and_e_flags *val1,
+				     const struct e_machine_and_e_flags *val2)
+{
+	if (val1->e_machine == val2->e_machine) {
+		if (val1->e_machine != EM_CSKY)
+			return 0;
+		if ((val1->e_flags & EF_CSKY_ABIMASK) < (val2->e_flags & EF_CSKY_ABIMASK))
+			return -1;
+		return (val1->e_flags & EF_CSKY_ABIMASK) > (val2->e_flags & EF_CSKY_ABIMASK);
+	}
+	return val1->e_machine < val2->e_machine ? -1 : 1;
+}
 
-static int arch__key_cmp(const void *name, const void *archp)
+static int arch__key_cmp(const void *key, const void *archp)
 {
-	const struct arch *arch = archp;
+	const struct arch *const *arch = archp;
 
-	return strcmp(name, arch->name);
+	return e_machine_and_eflags__cmp(key, &(*arch)->id);
 }
 
 static int arch__cmp(const void *a, const void *b)
 {
-	const struct arch *aa = a;
-	const struct arch *ab = b;
+	const struct arch *const *aa = a;
+	const struct arch *const *ab = b;
 
-	return strcmp(aa->name, ab->name);
+	return e_machine_and_eflags__cmp(&(*aa)->id, &(*ab)->id);
 }
 
-static void arch__sort(void)
+const struct arch *arch__find(uint16_t e_machine, uint32_t e_flags, const char *cpuid)
 {
-	const int nmemb = ARRAY_SIZE(architectures);
+	static const struct arch *(*const arch_new_fn[])(const struct e_machine_and_e_flags *id,
+							 const char *cpuid) = {
+		[EM_386]	= arch__new_x86,
+		[EM_ARC]	= arch__new_arc,
+		[EM_ARM]	= arch__new_arm,
+		[EM_AARCH64]	= arch__new_arm64,
+		[EM_CSKY]	= arch__new_csky,
+		[EM_LOONGARCH]	= arch__new_loongarch,
+		[EM_MIPS]	= arch__new_mips,
+		[EM_PPC64]	= arch__new_powerpc,
+		[EM_PPC]	= arch__new_powerpc,
+		[EM_RISCV]	= arch__new_riscv64,
+		[EM_S390]	= arch__new_s390,
+		[EM_SPARC]	= arch__new_sparc,
+		[EM_SPARCV9]	= arch__new_sparc,
+		[EM_X86_64]	= arch__new_x86,
+	};
+	static const struct arch **archs;
+	static size_t num_archs;
+	struct e_machine_and_e_flags key = {
+		.e_machine = e_machine,
+		.e_flags = e_flags,
+	};
+	const struct arch *result = NULL, **tmp;
 
-	qsort(architectures, nmemb, sizeof(struct arch), arch__cmp);
-}
+	if (num_archs > 0) {
+		tmp = bsearch(&key, archs, num_archs, sizeof(*archs), arch__key_cmp);
+		if (tmp)
+			result = *tmp;
+	}
 
-struct arch *arch__find(const char *name)
-{
-	const int nmemb = ARRAY_SIZE(architectures);
-	static bool sorted;
+	if (result)
+		return result;
 
-	if (!sorted) {
-		arch__sort();
-		sorted = true;
+	if (e_machine >= ARRAY_SIZE(arch_new_fn) || arch_new_fn[e_machine] == NULL) {
+		errno = ENOTSUP;
+		return NULL;
 	}
 
-	return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp);
+	tmp = reallocarray(archs, num_archs + 1, sizeof(*archs));
+	if (!tmp)
+		return NULL;
+
+	result = arch_new_fn[e_machine](&key, cpuid);
+	if (!result) {
+		pr_err("%s: failed to initialize %s (%u) arch priv area\n",
+			__func__, result->name, e_machine);
+		free(tmp);
+		return NULL;
+	}
+	archs = tmp;
+	archs[num_archs++] = result;
+	qsort(archs, num_archs, sizeof(*archs), arch__cmp);
+	return result;
+}
+
+bool arch__is_x86(const struct arch *arch)
+{
+	return arch->id.e_machine == EM_386 || arch->id.e_machine == EM_X86_64;
 }
 
-bool arch__is(struct arch *arch, const char *name)
+bool arch__is_powerpc(const struct arch *arch)
 {
-	return !strcmp(arch->name, name);
+	return arch->id.e_machine == EM_PPC || arch->id.e_machine == EM_PPC64;
 }
 
 static void ins_ops__delete(struct ins_operands *ops)
@@ -241,14 +212,14 @@ static void ins_ops__delete(struct ins_operands *ops)
 	zfree(&ops->target.name);
 }
 
-static int ins__raw_scnprintf(struct ins *ins, char *bf, size_t size,
-			      struct ins_operands *ops, int max_ins_name)
+int ins__raw_scnprintf(const struct ins *ins, char *bf, size_t size,
+			   struct ins_operands *ops, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->raw);
 }
 
-static int ins__scnprintf(struct ins *ins, char *bf, size_t size,
-			  struct ins_operands *ops, int max_ins_name)
+int ins__scnprintf(const struct ins *ins, char *bf, size_t size,
+		   struct ins_operands *ops, int max_ins_name)
 {
 	if (ins->ops->scnprintf)
 		return ins->ops->scnprintf(ins, bf, size, ops, max_ins_name);
@@ -256,7 +227,7 @@ static int ins__scnprintf(struct ins *ins, char *bf, size_t size,
 	return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name);
 }
 
-bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2)
+bool ins__is_fused(const struct arch *arch, const char *ins1, const char *ins2)
 {
 	if (!arch || !arch->ins_is_fused)
 		return false;
@@ -264,14 +235,12 @@ bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2)
 	return arch->ins_is_fused(arch, ins1, ins2);
 }
 
-static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
+static int call__parse(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
 		struct disasm_line *dl __maybe_unused)
 {
 	char *endptr, *tok, *name;
 	struct map *map = ms->map;
-	struct addr_map_symbol target = {
-		.ms = { .map = map, },
-	};
+	struct addr_map_symbol target;
 
 	ops->target.addr = strtoull(ops->raw, &endptr, 16);
 
@@ -296,12 +265,16 @@ static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_s
 	if (ops->target.name == NULL)
 		return -1;
 find_target:
-	target.addr = map__objdump_2mem(map, ops->target.addr);
+	target = (struct addr_map_symbol) {
+		.ms = { .map = map__get(map), },
+		.addr = map__objdump_2mem(map, ops->target.addr),
+	};
 
-	if (maps__find_ams(ms->maps, &target) == 0 &&
+	if (maps__find_ams(thread__maps(ms->thread), &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
+	addr_map_symbol__exit(&target);
 	return 0;
 
 indirect_call:
@@ -317,8 +290,8 @@ indirect_call:
 	goto find_target;
 }
 
-static int call__scnprintf(struct ins *ins, char *bf, size_t size,
-			   struct ins_operands *ops, int max_ins_name)
+int call__scnprintf(const struct ins *ins, char *bf, size_t size,
+		      struct ins_operands *ops, int max_ins_name)
 {
 	if (ops->target.sym)
 		return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name);
@@ -332,14 +305,15 @@ static int call__scnprintf(struct ins *ins, char *bf, size_t size,
 	return scnprintf(bf, size, "%-*s *%" PRIx64, max_ins_name, ins->name, ops->target.addr);
 }
 
-static struct ins_ops call_ops = {
+const struct ins_ops call_ops = {
 	.parse	   = call__parse,
 	.scnprintf = call__scnprintf,
+	.is_call   = true,
 };
 
 bool ins__is_call(const struct ins *ins)
 {
-	return ins->ops == &call_ops || ins->ops == &s390_call_ops || ins->ops == &loongarch_call_ops;
+	return ins->ops && ins->ops->is_call;
 }
 
 /*
@@ -360,13 +334,13 @@ static inline const char *validate_comma(const char *c, struct ins_operands *ops
 	return c;
 }
 
-static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
+static int jump__parse(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
 		struct disasm_line *dl __maybe_unused)
 {
 	struct map *map = ms->map;
 	struct symbol *sym = ms->sym;
 	struct addr_map_symbol target = {
-		.ms = { .map = map, },
+		.ms = { .map = map__get(map), },
 	};
 	const char *c = strchr(ops->raw, ',');
 	u64 start, end;
@@ -430,7 +404,7 @@ static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_s
 	 * Actual navigation will come next, with further understanding of how
 	 * the symbol searching and disassembly should be done.
 	 */
-	if (maps__find_ams(ms->maps, &target) == 0 &&
+	if (maps__find_ams(thread__maps(ms->thread), &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
@@ -440,12 +414,12 @@ static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_s
 	} else {
 		ops->target.offset_avail = false;
 	}
-
+	addr_map_symbol__exit(&target);
 	return 0;
 }
 
-static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
-			   struct ins_operands *ops, int max_ins_name)
+int jump__scnprintf(const struct ins *ins, char *bf, size_t size,
+		      struct ins_operands *ops, int max_ins_name)
 {
 	const char *c;
 
@@ -485,15 +459,16 @@ static void jump__delete(struct ins_operands *ops __maybe_unused)
 	 */
 }
 
-static struct ins_ops jump_ops = {
+const struct ins_ops jump_ops = {
 	.free	   = jump__delete,
 	.parse	   = jump__parse,
 	.scnprintf = jump__scnprintf,
+	.is_jump   = true,
 };
 
 bool ins__is_jump(const struct ins *ins)
 {
-	return ins->ops == &jump_ops || ins->ops == &loongarch_jump_ops;
+	return ins->ops && ins->ops->is_jump;
 }
 
 static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep)
@@ -523,7 +498,7 @@ static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep)
 	return 0;
 }
 
-static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
+static int lock__parse(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
 		struct disasm_line *dl __maybe_unused)
 {
 	ops->locked.ops = zalloc(sizeof(*ops->locked.ops));
@@ -549,7 +524,7 @@ out_free_ops:
 	return 0;
 }
 
-static int lock__scnprintf(struct ins *ins, char *bf, size_t size,
+static int lock__scnprintf(const struct ins *ins, char *bf, size_t size,
 			   struct ins_operands *ops, int max_ins_name)
 {
 	int printed;
@@ -577,7 +552,7 @@ static void lock__delete(struct ins_operands *ops)
 	zfree(&ops->target.name);
 }
 
-static struct ins_ops lock_ops = {
+const struct ins_ops lock_ops = {
 	.free	   = lock__delete,
 	.parse	   = lock__parse,
 	.scnprintf = lock__scnprintf,
@@ -590,7 +565,7 @@ static struct ins_ops lock_ops = {
  * But it doesn't care segment selectors like %gs:0x5678(%rcx), so just check
  * the input string after 'memory_ref_char' if exists.
  */
-static bool check_multi_regs(struct arch *arch, const char *op)
+static bool check_multi_regs(const struct arch *arch, const char *op)
 {
 	int count = 0;
 
@@ -611,8 +586,9 @@ static bool check_multi_regs(struct arch *arch, const char *op)
 	return count > 1;
 }
 
-static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused,
-		struct disasm_line *dl __maybe_unused)
+static int mov__parse(const struct arch *arch, struct ins_operands *ops,
+		      struct map_symbol *ms __maybe_unused,
+		      struct disasm_line *dl __maybe_unused)
 {
 	char *s = strchr(ops->raw, ','), *target, *comment, prev;
 
@@ -677,105 +653,22 @@ out_free_source:
 	return -1;
 }
 
-static int mov__scnprintf(struct ins *ins, char *bf, size_t size,
-			   struct ins_operands *ops, int max_ins_name)
+int mov__scnprintf(const struct ins *ins, char *bf, size_t size,
+		     struct ins_operands *ops, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s %s,%s", max_ins_name, ins->name,
 			 ops->source.name ?: ops->source.raw,
 			 ops->target.name ?: ops->target.raw);
 }
 
-static struct ins_ops mov_ops = {
+const struct ins_ops mov_ops = {
 	.parse	   = mov__parse,
 	.scnprintf = mov__scnprintf,
 };
 
-#define PPC_22_30(R)    (((R) >> 1) & 0x1ff)
-#define MINUS_EXT_XO_FORM	234
-#define SUB_EXT_XO_FORM		232
-#define	ADD_ZERO_EXT_XO_FORM	202
-#define	SUB_ZERO_EXT_XO_FORM	200
-
-static int arithmetic__scnprintf(struct ins *ins, char *bf, size_t size,
-		struct ins_operands *ops, int max_ins_name)
-{
-	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
-			ops->raw);
-}
-
-/*
- * Sets the fields: multi_regs and "mem_ref".
- * "mem_ref" is set for ops->source which is later used to
- * fill the objdump->memory_ref-char field. This ops is currently
- * used by powerpc and since binary instruction code is used to
- * extract opcode, regs and offset, no other parsing is needed here.
- *
- * Dont set multi regs for 4 cases since it has only one operand
- * for source:
- * - Add to Minus One Extended XO-form ( Ex: addme, addmeo )
- * - Subtract From Minus One Extended XO-form ( Ex: subfme )
- * - Add to Zero Extended XO-form ( Ex: addze, addzeo )
- * - Subtract From Zero Extended XO-form ( Ex: subfze )
- */
-static int arithmetic__parse(struct arch *arch __maybe_unused, struct ins_operands *ops,
-		struct map_symbol *ms __maybe_unused, struct disasm_line *dl)
-{
-	int opcode = PPC_OP(dl->raw.raw_insn);
-
-	ops->source.mem_ref = false;
-	if (opcode == 31) {
-		if ((opcode != MINUS_EXT_XO_FORM) && (opcode != SUB_EXT_XO_FORM) \
-				&& (opcode != ADD_ZERO_EXT_XO_FORM) && (opcode != SUB_ZERO_EXT_XO_FORM))
-			ops->source.multi_regs = true;
-	}
-
-	ops->target.mem_ref = false;
-	ops->target.multi_regs = false;
-
-	return 0;
-}
-
-static struct ins_ops arithmetic_ops = {
-	.parse     = arithmetic__parse,
-	.scnprintf = arithmetic__scnprintf,
-};
-
-static int load_store__scnprintf(struct ins *ins, char *bf, size_t size,
-		struct ins_operands *ops, int max_ins_name)
-{
-	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
-			ops->raw);
-}
-
-/*
- * Sets the fields: multi_regs and "mem_ref".
- * "mem_ref" is set for ops->source which is later used to
- * fill the objdump->memory_ref-char field. This ops is currently
- * used by powerpc and since binary instruction code is used to
- * extract opcode, regs and offset, no other parsing is needed here
- */
-static int load_store__parse(struct arch *arch __maybe_unused, struct ins_operands *ops,
-		struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused)
-{
-	ops->source.mem_ref = true;
-	ops->source.multi_regs = false;
-	/* opcode 31 is of X form */
-	if (PPC_OP(dl->raw.raw_insn) == 31)
-		ops->source.multi_regs = true;
-
-	ops->target.mem_ref = false;
-	ops->target.multi_regs = false;
-
-	return 0;
-}
-
-static struct ins_ops load_store_ops = {
-	.parse     = load_store__parse,
-	.scnprintf = load_store__scnprintf,
-};
-
-static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused,
-		struct disasm_line *dl __maybe_unused)
+static int dec__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
+		      struct map_symbol *ms __maybe_unused,
+		      struct disasm_line *dl __maybe_unused)
 {
 	char *target, *comment, *s, prev;
 
@@ -802,29 +695,29 @@ static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops
 	return 0;
 }
 
-static int dec__scnprintf(struct ins *ins, char *bf, size_t size,
+static int dec__scnprintf(const struct ins *ins, char *bf, size_t size,
 			   struct ins_operands *ops, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
 			 ops->target.name ?: ops->target.raw);
 }
 
-static struct ins_ops dec_ops = {
+const struct ins_ops dec_ops = {
 	.parse	   = dec__parse,
 	.scnprintf = dec__scnprintf,
 };
 
-static int nop__scnprintf(struct ins *ins __maybe_unused, char *bf, size_t size,
+static int nop__scnprintf(const struct ins *ins __maybe_unused, char *bf, size_t size,
 			  struct ins_operands *ops __maybe_unused, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s", max_ins_name, "nop");
 }
 
-static struct ins_ops nop_ops = {
+const struct ins_ops nop_ops = {
 	.scnprintf = nop__scnprintf,
 };
 
-static struct ins_ops ret_ops = {
+const struct ins_ops ret_ops = {
 	.scnprintf = ins__raw_scnprintf,
 };
 
@@ -862,20 +755,21 @@ static void ins__sort(struct arch *arch)
 {
 	const int nmemb = arch->nr_instructions;
 
-	qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp);
+	qsort((void *)arch->instructions, nmemb, sizeof(struct ins), ins__cmp);
 }
 
-static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct disasm_line *dl)
+static const struct ins_ops *__ins__find(const struct arch *arch, const char *name,
+				     struct disasm_line *dl)
 {
-	struct ins *ins;
+	const struct ins *ins;
 	const int nmemb = arch->nr_instructions;
 
-	if (arch__is(arch, "powerpc")) {
+	if (arch__is_powerpc(arch)) {
 		/*
 		 * For powerpc, identify the instruction ops
 		 * from the opcode using raw_insn.
 		 */
-		struct ins_ops *ops;
+		const struct ins_ops *ops;
 
 		ops = check_ppc_insn(dl);
 		if (ops)
@@ -883,8 +777,8 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct d
 	}
 
 	if (!arch->sorted_instructions) {
-		ins__sort(arch);
-		arch->sorted_instructions = true;
+		ins__sort((struct arch *)arch);
+		((struct arch *)arch)->sorted_instructions = true;
 	}
 
 	ins = bsearch(name, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp);
@@ -911,17 +805,18 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct d
 	return ins ? ins->ops : NULL;
 }
 
-struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl)
+const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl)
 {
-	struct ins_ops *ops = __ins__find(arch, name, dl);
+	const struct ins_ops *ops = __ins__find(arch, name, dl);
 
 	if (!ops && arch->associate_instruction_ops)
-		ops = arch->associate_instruction_ops(arch, name);
+		ops = arch->associate_instruction_ops((struct arch *)arch, name);
 
 	return ops;
 }
 
-static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map_symbol *ms)
+static void disasm_line__init_ins(struct disasm_line *dl, const struct arch *arch,
+				    struct map_symbol *ms)
 {
 	dl->ins.ops = ins__find(arch, dl->ins.name, dl);
 
@@ -1046,7 +941,7 @@ static size_t disasm_line_size(int nr)
 struct disasm_line *disasm_line__new(struct annotate_args *args)
 {
 	struct disasm_line *dl = NULL;
-	struct annotation *notes = symbol__annotation(args->ms.sym);
+	struct annotation *notes = symbol__annotation(args->ms->sym);
 	int nr = notes->src->nr_events;
 
 	dl = zalloc(disasm_line_size(nr));
@@ -1058,13 +953,13 @@ struct disasm_line *disasm_line__new(struct annotate_args *args)
 		goto out_delete;
 
 	if (args->offset != -1) {
-		if (arch__is(args->arch, "powerpc")) {
+		if (arch__is_powerpc(args->arch)) {
 			if (disasm_line__parse_powerpc(dl, args) < 0)
 				goto out_free_line;
 		} else if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0)
 			goto out_free_line;
 
-		disasm_line__init_ins(dl, args->arch, &args->ms);
+		disasm_line__init_ins(dl, args->arch, args->ms);
 	}
 
 	return dl;
@@ -1119,7 +1014,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 				      struct annotate_args *args,
 				      char *parsed_line, int *line_nr, char **fileloc)
 {
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct annotation *notes = symbol__annotation(sym);
 	struct disasm_line *dl;
 	char *tmp;
@@ -1151,7 +1046,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 	args->line    = parsed_line;
 	args->line_nr = *line_nr;
 	args->fileloc = *fileloc;
-	args->ms.sym  = sym;
+	args->ms->sym  = sym;
 
 	dl = disasm_line__new(args);
 	(*line_nr)++;
@@ -1169,12 +1064,14 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 	if (dl->ins.ops && ins__is_call(&dl->ins) && !dl->ops.target.sym) {
 		struct addr_map_symbol target = {
 			.addr = dl->ops.target.addr,
-			.ms = { .map = map, },
+			.ms = { .map = map__get(map), },
 		};
 
-		if (!maps__find_ams(args->ms.maps, &target) &&
+		if (!maps__find_ams(thread__maps(args->ms->thread), &target) &&
 		    target.ms.sym->start == target.al_addr)
 			dl->ops.target.sym = target.ms.sym;
+
+		addr_map_symbol__exit(&target);
 	}
 
 	annotation_line__add(&dl->al, &notes->src->source);
@@ -1338,7 +1235,7 @@ static int symbol__disassemble_raw(char *filename, struct symbol *sym,
 					struct annotate_args *args)
 {
 	struct annotation *notes = symbol__annotation(sym);
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	u64 start = map__rip_2objdump(map, sym->start);
 	u64 end = map__rip_2objdump(map, sym->end);
@@ -1375,7 +1272,7 @@ static int symbol__disassemble_raw(char *filename, struct symbol *sym,
 	args->line = disasm_buf;
 	args->line_nr = 0;
 	args->fileloc = NULL;
-	args->ms.sym = sym;
+	args->ms->sym = sym;
 
 	dl = disasm_line__new(args);
 	if (dl == NULL)
@@ -1501,7 +1398,7 @@ static int symbol__disassemble_objdump(const char *filename, struct symbol *sym,
 				       struct annotate_args *args)
 {
 	struct annotation_options *opts = &annotate_opts;
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	char *command;
 	FILE *file;
@@ -1644,7 +1541,7 @@ out_free_command:
 int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
 {
 	struct annotation_options *options = args->options;
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	char symfs_filename[PATH_MAX];
 	bool delete_extract = false;
@@ -1690,7 +1587,7 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
 	 * and typeoff, disassemble to mnemonic notation is not required in
 	 * case of powerpc.
 	 */
-	if (arch__is(args->arch, "powerpc")) {
+	if (arch__is_powerpc(args->arch)) {
 		extern const char *sort_order;
 
 		if (sort_order && !strstr(sort_order, "sym")) {
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index d2cb555e4a3b..a6e478caf61a 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -17,21 +17,23 @@ struct data_loc_info;
 struct type_state;
 struct disasm_line;
 
+struct e_machine_and_e_flags {
+	uint32_t e_flags;
+	uint16_t e_machine;
+};
+
 struct arch {
-	const char	*name;
-	struct ins	*instructions;
-	size_t		nr_instructions;
-	size_t		nr_instructions_allocated;
-	struct ins_ops  *(*associate_instruction_ops)(struct arch *arch, const char *name);
-	bool		sorted_instructions;
-	bool		initialized;
-	const char	*insn_suffix;
-	void		*priv;
-	unsigned int	model;
-	unsigned int	family;
-	int		(*init)(struct arch *arch, char *cpuid);
-	bool		(*ins_is_fused)(struct arch *arch, const char *ins1,
-					const char *ins2);
+	/** @name: name such as "x86" or "powerpc". */
+	const char		*name;
+	const struct ins	*instructions;
+	size_t			nr_instructions;
+	size_t			nr_instructions_allocated;
+	const char		*insn_suffix;
+	unsigned int		model;
+	unsigned int		family;
+	/** @id: ELF machine and flags associated with arch. */
+	struct e_machine_and_e_flags id;
+	bool			sorted_instructions;
 	struct		{
 		char comment_char;
 		char skip_functions_char;
@@ -39,20 +41,19 @@ struct arch {
 		char memory_ref_char;
 		char imm_char;
 	} objdump;
+	bool		(*ins_is_fused)(const struct arch *arch, const char *ins1,
+					const char *ins2);
+	const struct ins_ops  *(*associate_instruction_ops)(struct arch *arch, const char *name);
 #ifdef HAVE_LIBDW_SUPPORT
 	void		(*update_insn_state)(struct type_state *state,
 				struct data_loc_info *dloc, Dwarf_Die *cu_die,
 				struct disasm_line *dl);
 #endif
-	/** @e_machine: ELF machine associated with arch. */
-	unsigned int e_machine;
-	/** @e_flags: Optional ELF flags associated with arch. */
-	unsigned int e_flags;
 };
 
 struct ins {
 	const char     *name;
-	struct ins_ops *ops;
+	const struct ins_ops *ops;
 };
 
 struct ins_operands {
@@ -89,15 +90,17 @@ struct ins_operands {
 
 struct ins_ops {
 	void (*free)(struct ins_operands *ops);
-	int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
+	int (*parse)(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
 			struct disasm_line *dl);
-	int (*scnprintf)(struct ins *ins, char *bf, size_t size,
+	int (*scnprintf)(const struct ins *ins, char *bf, size_t size,
 			 struct ins_operands *ops, int max_ins_name);
+	bool is_jump;
+	bool is_call;
 };
 
 struct annotate_args {
-	struct arch		  *arch;
-	struct map_symbol	  ms;
+	const struct arch	  *arch;
+	struct map_symbol	  *ms;
 	struct annotation_options *options;
 	s64			  offset;
 	char			  *line;
@@ -105,23 +108,59 @@ struct annotate_args {
 	char			  *fileloc;
 };
 
-struct arch *arch__find(const char *name);
-bool arch__is(struct arch *arch, const char *name);
-
-struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl);
+const struct arch *arch__find(uint16_t e_machine, uint32_t e_flags, const char *cpuid);
+bool arch__is_x86(const struct arch *arch);
+bool arch__is_powerpc(const struct arch *arch);
+
+extern const struct ins_ops call_ops;
+extern const struct ins_ops dec_ops;
+extern const struct ins_ops jump_ops;
+extern const struct ins_ops mov_ops;
+extern const struct ins_ops nop_ops;
+extern const struct ins_ops lock_ops;
+extern const struct ins_ops ret_ops;
+
+int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops);
+
+const struct arch *arch__new_arc(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_arm(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_arm64(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_csky(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_loongarch(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_mips(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_powerpc(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_riscv64(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_s390(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_sparc(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_x86(const struct e_machine_and_e_flags *id, const char *cpuid);
+
+const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl);
 
 bool ins__is_call(const struct ins *ins);
 bool ins__is_jump(const struct ins *ins);
-bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2);
+bool ins__is_fused(const struct arch *arch, const char *ins1, const char *ins2);
 bool ins__is_ret(const struct ins *ins);
 bool ins__is_lock(const struct ins *ins);
 
+const struct ins_ops *check_ppc_insn(struct disasm_line *dl);
+
 struct disasm_line *disasm_line__new(struct annotate_args *args);
 void disasm_line__free(struct disasm_line *dl);
 
 int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size,
 			   bool raw, int max_ins_name);
 
+int ins__raw_scnprintf(const struct ins *ins, char *bf, size_t size,
+			   struct ins_operands *ops, int max_ins_name);
+int ins__scnprintf(const struct ins *ins, char *bf, size_t size,
+		   struct ins_operands *ops, int max_ins_name);
+int call__scnprintf(const struct ins *ins, char *bf, size_t size,
+		    struct ins_operands *ops, int max_ins_name);
+int jump__scnprintf(const struct ins *ins, char *bf, size_t size,
+		    struct ins_operands *ops, int max_ins_name);
+int mov__scnprintf(const struct ins *ins, char *bf, size_t size,
+		   struct ins_operands *ops, int max_ins_name);
+
 int symbol__disassemble(struct symbol *sym, struct annotate_args *args);
 
 char *expand_tabs(char *line, char **storage, size_t *storage_len);
diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c
index c0afcbd954f8..dc31b5e7149e 100644
--- a/tools/perf/util/dlfilter.c
+++ b/tools/perf/util/dlfilter.c
@@ -234,8 +234,7 @@ static const __u8 *dlfilter__insn(void *ctx, __u32 *len)
 			struct machine *machine = maps__machine(thread__maps(al->thread));
 
 			if (machine)
-				script_fetch_insn(d->sample, al->thread, machine,
-						  /*native_arch=*/true);
+				perf_sample__fetch_insn(d->sample, al->thread, machine);
 		}
 	}
 
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 344e689567ee..b791e1b6b2cf 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -32,6 +32,7 @@
 #include "string2.h"
 #include "vdso.h"
 #include "annotate-data.h"
+#include "libdw.h"
 
 static const char * const debuglink_paths[] = {
 	"%.0s%s",
@@ -111,7 +112,7 @@ bool dso__is_object_file(const struct dso *dso)
 
 int dso__read_binary_type_filename(const struct dso *dso,
 				   enum dso_binary_type type,
-				   char *root_dir, char *filename, size_t size)
+				   const char *root_dir, char *filename, size_t size)
 {
 	char build_id_hex[SBUILD_ID_SIZE];
 	int ret = 0;
@@ -539,16 +540,13 @@ static void close_first_dso(void);
 
 static int do_open(char *name) EXCLUSIVE_LOCKS_REQUIRED(_dso__data_open_lock)
 {
-	int fd;
-	char sbuf[STRERR_BUFSIZE];
-
 	do {
-		fd = open(name, O_RDONLY|O_CLOEXEC);
+		int fd = open(name, O_RDONLY|O_CLOEXEC);
+
 		if (fd >= 0)
 			return fd;
 
-		pr_debug("dso open failed: %s\n",
-			 str_error_r(errno, sbuf, sizeof(sbuf)));
+		pr_debug("dso open failed: %m\n");
 		if (!dso__data_open_cnt || errno != EMFILE)
 			break;
 
@@ -563,20 +561,15 @@ char *dso__filename_with_chroot(const struct dso *dso, const char *filename)
 	return filename_with_chroot(nsinfo__pid(dso__nsinfo_const(dso)), filename);
 }
 
-static int __open_dso(struct dso *dso, struct machine *machine)
-	EXCLUSIVE_LOCKS_REQUIRED(_dso__data_open_lock)
+static char *dso__get_filename(struct dso *dso, const char *root_dir,
+			       bool *decomp)
 {
-	int fd = -EINVAL;
-	char *root_dir = (char *)"";
 	char *name = malloc(PATH_MAX);
-	bool decomp = false;
 
-	if (!name)
-		return -ENOMEM;
+	*decomp = false;
 
-	mutex_lock(dso__lock(dso));
-	if (machine)
-		root_dir = machine->root_dir;
+	if (name == NULL)
+		return NULL;
 
 	if (dso__read_binary_type_filename(dso, dso__binary_type(dso),
 					    root_dir, name, PATH_MAX))
@@ -601,20 +594,38 @@ static int __open_dso(struct dso *dso, struct machine *machine)
 		size_t len = sizeof(newpath);
 
 		if (dso__decompress_kmodule_path(dso, name, newpath, len) < 0) {
-			fd = -(*dso__load_errno(dso));
+			errno = *dso__load_errno(dso);
 			goto out;
 		}
 
-		decomp = true;
+		*decomp = true;
 		strcpy(name, newpath);
 	}
+	return name;
+
+out:
+	free(name);
+	return NULL;
+}
+
+static int __open_dso(struct dso *dso, struct machine *machine)
+	EXCLUSIVE_LOCKS_REQUIRED(_dso__data_open_lock)
+{
+	int fd = -EINVAL;
+	char *name;
+	bool decomp = false;
 
-	fd = do_open(name);
+	mutex_lock(dso__lock(dso));
+
+	name = dso__get_filename(dso, machine ? machine->root_dir : "", &decomp);
+	if (name)
+		fd = do_open(name);
+	else
+		fd = -errno;
 
 	if (decomp)
 		unlink(name);
 
-out:
 	mutex_unlock(dso__lock(dso));
 	free(name);
 	return fd;
@@ -1097,7 +1108,6 @@ static int file_size(struct dso *dso, struct machine *machine)
 {
 	int ret = 0;
 	struct stat st;
-	char sbuf[STRERR_BUFSIZE];
 
 	mutex_lock(dso__data_open_lock());
 
@@ -1115,8 +1125,7 @@ static int file_size(struct dso *dso, struct machine *machine)
 
 	if (fstat(dso__data(dso)->fd, &st) < 0) {
 		ret = -errno;
-		pr_err("dso cache fstat failed: %s\n",
-		       str_error_r(errno, sbuf, sizeof(sbuf)));
+		pr_err("dso cache fstat failed: %m\n");
 		dso__data(dso)->status = DSO_DATA_STATUS_ERROR;
 		goto out;
 	}
@@ -1194,7 +1203,92 @@ ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine,
 	return data_read_write_offset(dso, machine, offset, data, size, true);
 }
 
-uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
+static enum dso_swap_type dso_swap_type__from_elf_data(unsigned char eidata)
+{
+	static const unsigned int endian = 1;
+
+	switch (eidata) {
+	case ELFDATA2LSB:
+		/* We are big endian, DSO is little endian. */
+		return (*(unsigned char const *)&endian != 1) ? DSO_SWAP__YES : DSO_SWAP__NO;
+	case ELFDATA2MSB:
+		/* We are little endian, DSO is big endian. */
+		return (*(unsigned char const *)&endian != 0) ? DSO_SWAP__YES : DSO_SWAP__NO;
+	default:
+		return DSO_SWAP__UNSET;
+	}
+}
+
+/* Reads e_machine from fd, optionally caching data in dso. */
+uint16_t dso__read_e_machine(struct dso *optional_dso, int fd, uint32_t *e_flags)
+{
+	uint16_t e_machine = EM_NONE;
+	unsigned char e_ident[EI_NIDENT];
+	enum dso_swap_type swap_type;
+	bool need_e_flags;
+
+	if (e_flags)
+		*e_flags = 0;
+
+	{
+		_Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset");
+		_Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset");
+	}
+	if (pread(fd, &e_ident, sizeof(e_ident), 0) != sizeof(e_ident))
+		return EM_NONE; // Read failed.
+
+	if (memcmp(e_ident, ELFMAG, SELFMAG) != 0)
+		return EM_NONE; // Not an ELF file.
+
+	if (e_ident[EI_CLASS] == ELFCLASSNONE || e_ident[EI_CLASS] >= ELFCLASSNUM)
+		return EM_NONE; // Bad ELF class (32 or 64-bit objects).
+
+	if (e_ident[EI_VERSION] != EV_CURRENT)
+		return EM_NONE; // Bad ELF version.
+
+	swap_type = dso_swap_type__from_elf_data(e_ident[EI_DATA]);
+	if (swap_type == DSO_SWAP__UNSET)
+		return EM_NONE; // Bad ELF data encoding.
+
+	/* Cache the need for swapping. */
+	if (optional_dso) {
+		assert(dso__needs_swap(optional_dso) == DSO_SWAP__UNSET ||
+		       dso__needs_swap(optional_dso) == swap_type);
+		dso__set_needs_swap(optional_dso, swap_type);
+	}
+
+	{
+		_Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset");
+		_Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset");
+	}
+	if (pread(fd, &e_machine, sizeof(e_machine), 18) != sizeof(e_machine))
+		return EM_NONE; // e_machine read failed.
+
+	e_machine = DSO_SWAP_TYPE__SWAP(swap_type, uint16_t, e_machine);
+	if (e_machine >= EM_NUM)
+		return EM_NONE; // Bad ELF machine number.
+
+#ifdef NDEBUG
+	/* In production code the e_flags are only needed on CSKY. */
+	need_e_flags = e_flags && e_machine == EM_CSKY;
+#else
+	/* Debug code will always read the e_flags. */
+	need_e_flags = e_flags != NULL;
+#endif
+	if (need_e_flags) {
+		off_t offset = e_ident[EI_CLASS] == ELFCLASS32
+			? offsetof(Elf32_Ehdr, e_flags)
+			: offsetof(Elf64_Ehdr, e_flags);
+
+		if (pread(fd, e_flags, sizeof(*e_flags), offset) != sizeof(*e_flags)) {
+			*e_flags = 0;
+			return EM_NONE; // e_flags read failed.
+		}
+	}
+	return e_machine;
+}
+
+uint16_t dso__e_machine(struct dso *dso, struct machine *machine, uint32_t *e_flags)
 {
 	uint16_t e_machine = EM_NONE;
 	int fd;
@@ -1214,6 +1308,8 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
 	case DSO_BINARY_TYPE__BPF_IMAGE:
 	case DSO_BINARY_TYPE__OOL:
 	case DSO_BINARY_TYPE__JAVA_JIT:
+		if (e_flags)
+			*e_flags = EF_HOST;
 		return EM_HOST;
 	case DSO_BINARY_TYPE__DEBUGLINK:
 	case DSO_BINARY_TYPE__BUILD_ID_CACHE:
@@ -1228,6 +1324,8 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
 		break;
 	case DSO_BINARY_TYPE__NOT_FOUND:
 	default:
+		if (e_flags)
+			*e_flags = 0;
 		return EM_NONE;
 	}
 
@@ -1239,19 +1337,11 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
 	 */
 	try_to_open_dso(dso, machine);
 	fd = dso__data(dso)->fd;
-	if (fd >= 0) {
-		_Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset");
-		_Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset");
-		if (dso__needs_swap(dso) == DSO_SWAP__UNSET) {
-			unsigned char eidata;
+	if (fd >= 0)
+		e_machine = dso__read_e_machine(dso, fd, e_flags);
+	else if (e_flags)
+		*e_flags = 0;
 
-			if (pread(fd, &eidata, sizeof(eidata), EI_DATA) == sizeof(eidata))
-				dso__swap_init(dso, eidata);
-		}
-		if (dso__needs_swap(dso) != DSO_SWAP__UNSET &&
-		    pread(fd, &e_machine, sizeof(e_machine), 18) == sizeof(e_machine))
-			e_machine = DSO__SWAP(dso, uint16_t, e_machine);
-	}
 	mutex_unlock(dso__data_open_lock());
 	return e_machine;
 }
@@ -1605,6 +1695,7 @@ void dso__delete(struct dso *dso)
 	auxtrace_cache__free(RC_CHK_ACCESS(dso)->auxtrace_cache);
 	dso_cache__free(dso);
 	dso__free_a2l(dso);
+	dso__free_libdw(dso);
 	dso__free_symsrc_filename(dso);
 	nsinfo__zput(RC_CHK_ACCESS(dso)->nsinfo);
 	mutex_destroy(dso__lock(dso));
@@ -1635,28 +1726,13 @@ void dso__put(struct dso *dso)
 
 int dso__swap_init(struct dso *dso, unsigned char eidata)
 {
-	static unsigned int const endian = 1;
-
-	dso__set_needs_swap(dso, DSO_SWAP__NO);
-
-	switch (eidata) {
-	case ELFDATA2LSB:
-		/* We are big endian, DSO is little endian. */
-		if (*(unsigned char const *)&endian != 1)
-			dso__set_needs_swap(dso, DSO_SWAP__YES);
-		break;
-
-	case ELFDATA2MSB:
-		/* We are little endian, DSO is big endian. */
-		if (*(unsigned char const *)&endian != 0)
-			dso__set_needs_swap(dso, DSO_SWAP__YES);
-		break;
+	enum dso_swap_type type = dso_swap_type__from_elf_data(eidata);
 
-	default:
+	dso__set_needs_swap(dso, type);
+	if (type == DSO_SWAP__UNSET) {
 		pr_err("unrecognized DSO data encoding %d\n", eidata);
 		return -EINVAL;
 	}
-
 	return 0;
 }
 
@@ -1771,10 +1847,8 @@ int dso__strerror_load(struct dso *dso, char *buf, size_t buflen)
 	BUG_ON(buflen == 0);
 
 	if (errnum >= 0) {
-		const char *err = str_error_r(errnum, buf, buflen);
-
-		if (err != buf)
-			scnprintf(buf, buflen, "%s", err);
+		errno = errnum;
+		scnprintf(buf, buflen, "%m");
 
 		return 0;
 	}
@@ -1910,3 +1984,23 @@ const u8 *dso__read_symbol(struct dso *dso, const char *symfs_filename,
 	return __dso__read_symbol(dso, symfs_filename, start, len,
 				  out_buf, out_buf_len, is_64bit);
 }
+
+struct debuginfo *dso__debuginfo(struct dso *dso)
+{
+	char *name;
+	bool decomp = false;
+	struct debuginfo *dinfo = NULL;
+
+	mutex_lock(dso__lock(dso));
+
+	name = dso__get_filename(dso, "", &decomp);
+	if (name)
+		dinfo = debuginfo__new(name);
+
+	if (decomp)
+		unlink(name);
+
+	mutex_unlock(dso__lock(dso));
+	free(name);
+	return dinfo;
+}
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index f8ccb9816b89..ede691e9a249 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -160,12 +160,11 @@ enum dso_load_errno {
 	__DSO_LOAD_ERRNO__END,
 };
 
-#define DSO__SWAP(dso, type, val)				\
+#define DSO_SWAP_TYPE__SWAP(swap_type, type, val)		\
 ({								\
 	type ____r = val;					\
-	enum dso_swap_type ___dst = dso__needs_swap(dso);	\
-	BUG_ON(___dst == DSO_SWAP__UNSET);			\
-	if (___dst == DSO_SWAP__YES) {				\
+	BUG_ON(swap_type == DSO_SWAP__UNSET);			\
+	if (swap_type == DSO_SWAP__YES) {			\
 		switch (sizeof(____r)) {			\
 		case 2:						\
 			____r = bswap_16(val);			\
@@ -183,6 +182,8 @@ enum dso_load_errno {
 	____r;							\
 })
 
+#define DSO__SWAP(dso, type, val) DSO_SWAP_TYPE__SWAP(dso__needs_swap(dso), type, val)
+
 #define DSO__DATA_CACHE_SIZE 4096
 #define DSO__DATA_CACHE_MASK ~(DSO__DATA_CACHE_SIZE - 1)
 
@@ -268,10 +269,8 @@ DECLARE_RC_STRUCT(dso) {
 	const char	 *short_name;
 	const char	 *long_name;
 	void		 *a2l;
+	void		 *libdw;
 	char		 *symsrc_filename;
-#if defined(__powerpc__)
-	void		*dwfl;			/* DWARF debug info */
-#endif
 	struct nsinfo	*nsinfo;
 	struct auxtrace_cache *auxtrace_cache;
 	union { /* Tool specific area */
@@ -334,6 +333,26 @@ static inline void dso__set_a2l(struct dso *dso, void *val)
 	RC_CHK_ACCESS(dso)->a2l = val;
 }
 
+static inline void *dso__libdw(const struct dso *dso)
+{
+	return RC_CHK_ACCESS(dso)->libdw;
+}
+
+static inline void dso__set_libdw(struct dso *dso, void *val)
+{
+	RC_CHK_ACCESS(dso)->libdw = val;
+}
+
+struct Dwfl;
+#ifdef HAVE_LIBDW_SUPPORT
+struct Dwfl *dso__libdw_dwfl(struct dso *dso);
+#else
+static inline struct Dwfl *dso__libdw_dwfl(struct dso *dso __maybe_unused)
+{
+	return NULL;
+}
+#endif
+
 static inline unsigned int dso__a2l_fails(const struct dso *dso)
 {
 	return RC_CHK_ACCESS(dso)->a2l_fails;
@@ -766,7 +785,7 @@ int dso__kernel_module_get_build_id(struct dso *dso, const char *root_dir);
 
 char dso__symtab_origin(const struct dso *dso);
 int dso__read_binary_type_filename(const struct dso *dso, enum dso_binary_type type,
-				   char *root_dir, char *filename, size_t size);
+				   const char *root_dir, char *filename, size_t size);
 bool is_kernel_module(const char *pathname, int cpumode);
 bool dso__needs_decompress(struct dso *dso);
 int dso__decompress_kmodule_fd(struct dso *dso, const char *name);
@@ -847,7 +866,8 @@ int dso__data_file_size(struct dso *dso, struct machine *machine);
 off_t dso__data_size(struct dso *dso, struct machine *machine);
 ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine,
 			      u64 offset, u8 *data, ssize_t size);
-uint16_t dso__e_machine(struct dso *dso, struct machine *machine);
+uint16_t dso__read_e_machine(struct dso *optional_dso, int fd, uint32_t *e_flags);
+uint16_t dso__e_machine(struct dso *dso, struct machine *machine, uint32_t *e_flags);
 ssize_t dso__data_read_addr(struct dso *dso, struct map *map,
 			    struct machine *machine, u64 addr,
 			    u8 *data, ssize_t size);
@@ -915,14 +935,7 @@ u64 dso__findnew_global_type(struct dso *dso, u64 addr, u64 offset);
 bool perf_pid_map_tid(const char *dso_name, int *tid);
 bool is_perf_pid_map_name(const char *dso_name);
 
-/*
- * In the future, we may get debuginfo using build-ID (w/o path).
- * Add this helper is for the smooth conversion.
- */
-static inline struct debuginfo *dso__debuginfo(struct dso *dso)
-{
-	return debuginfo__new(dso__long_name(dso));
-}
+struct debuginfo *dso__debuginfo(struct dso *dso);
 
 const u8 *dso__read_symbol(struct dso *dso, const char *symfs_filename,
 			   const struct map *map, const struct symbol *sym,
diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build
new file mode 100644
index 000000000000..ceb68ae86fd8
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/Build
@@ -0,0 +1,9 @@
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm64.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-loongarch.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-mips.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-riscv.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-s390.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c
new file mode 100644
index 000000000000..42c6c0635612
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/arm/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_arm(int perf_regnum)
+{
+	if (perf_regnum < 0 || perf_regnum >= PERF_REG_ARM_MAX)
+		return -ENOENT;
+
+	return perf_regnum;
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c
new file mode 100644
index 000000000000..593ca7d4fccc
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/arm64/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum)
+{
+	if (perf_regnum < 0 || perf_regnum >= PERF_REG_ARM64_MAX)
+		return -ENOENT;
+
+	return perf_regnum;
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
new file mode 100644
index 000000000000..cb44b774f8d9
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
+// Mapping of DWARF debug register numbers into register names.
+
+#include <errno.h>
+#include <stddef.h>
+#include <dwarf-regs.h>
+// Ensure the V2 perf reg definitions are included.
+#undef __CSKYABIV2__
+#define __CSKYABIV2__ 1
+#include "../../../arch/csky/include/uapi/asm/perf_regs.h"
+
+#define CSKY_ABIV2_MAX_REGS 73
+static const char * const csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = {
+	/* r0 ~ r8 */
+	"%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", "%regs2", "%regs3",
+	/* r9 ~ r15 */
+	"%regs4", "%regs5", "%regs6", "%regs7", "%regs8", "%regs9", "%sp",
+	"%lr",
+	/* r16 ~ r23 */
+	"%exregs0", "%exregs1", "%exregs2", "%exregs3", "%exregs4",
+	"%exregs5", "%exregs6", "%exregs7",
+	/* r24 ~ r31 */
+	"%exregs8", "%exregs9", "%exregs10", "%exregs11", "%exregs12",
+	"%exregs13", "%exregs14", "%tls",
+	"%pc", NULL, NULL, NULL, "%hi", "%lo", NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	"%epc",
+};
+
+#define CSKY_ABIV1_MAX_REGS 57
+static const char * const csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = {
+	/* r0 ~ r8 */
+	"%sp", "%regs9", "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1",
+	/* r9 ~ r15 */
+	"%regs2", "%regs3", "%regs4", "%regs5", "%regs6", "%regs7", "%regs8",
+	"%lr",
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	"%epc",
+};
+
+const char *__get_csky_regstr(unsigned int n, unsigned int flags)
+{
+	if (flags & EF_CSKY_ABIV2)
+		return (n < CSKY_ABIV2_MAX_REGS) ? csky_dwarf_regs_table_abiv2[n] : NULL;
+
+	return (n < CSKY_ABIV1_MAX_REGS) ? csky_dwarf_regs_table_abiv1[n] : NULL;
+}
+
+static int __get_dwarf_regnum(const char *const *regstr, size_t num_regstr, const char *name)
+{
+	for (size_t i = 0; i < num_regstr; i++) {
+		if (regstr[i] && !strcmp(regstr[i], name))
+			return i;
+	}
+	return -ENOENT;
+}
+
+int __get_csky_regnum(const char *name, unsigned int flags)
+{
+	if (flags & EF_CSKY_ABIV2)
+		return __get_dwarf_regnum(csky_dwarf_regs_table_abiv2, CSKY_ABIV2_MAX_REGS, name);
+
+	return __get_dwarf_regnum(csky_dwarf_regs_table_abiv1, CSKY_ABIV1_MAX_REGS, name);
+}
+
+int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags)
+{
+	static const int dwarf_csky_regnums[][2] = {
+		[PERF_REG_CSKY_TLS] = {-ENOENT, 31},
+		[PERF_REG_CSKY_LR] = {15, 15},
+		[PERF_REG_CSKY_PC] = {-ENOENT, 32},
+		/* TODO: PERF_REG_CSKY_SR */
+		[PERF_REG_CSKY_SP] = {0, 14},
+		/* TODO: PERF_REG_CSKY_ORIG_A0 */
+		[PERF_REG_CSKY_A0] = {2, 0},
+		[PERF_REG_CSKY_A1] = {3, 1},
+		[PERF_REG_CSKY_A2] = {4, 2},
+		[PERF_REG_CSKY_A3] = {5, 3},
+		[PERF_REG_CSKY_REGS0] = {6, 4},
+		[PERF_REG_CSKY_REGS1] = {7, 5},
+		[PERF_REG_CSKY_REGS2] = {8, 6},
+		[PERF_REG_CSKY_REGS3] = {9, 7},
+		[PERF_REG_CSKY_REGS4] = {10, 8},
+		[PERF_REG_CSKY_REGS5] = {11, 9},
+		[PERF_REG_CSKY_REGS6] = {12, 10},
+		[PERF_REG_CSKY_REGS7] = {13, 11},
+		[PERF_REG_CSKY_REGS8] = {14, 12},
+		[PERF_REG_CSKY_REGS9] = {1, 13},
+		[PERF_REG_CSKY_EXREGS0] = {-ENOENT, 16},
+		[PERF_REG_CSKY_EXREGS1] = {-ENOENT, 17},
+		[PERF_REG_CSKY_EXREGS2] = {-ENOENT, 18},
+		[PERF_REG_CSKY_EXREGS3] = {-ENOENT, 19},
+		[PERF_REG_CSKY_EXREGS4] = {-ENOENT, 20},
+		[PERF_REG_CSKY_EXREGS5] = {-ENOENT, 21},
+		[PERF_REG_CSKY_EXREGS6] = {-ENOENT, 22},
+		[PERF_REG_CSKY_EXREGS7] = {-ENOENT, 23},
+		[PERF_REG_CSKY_EXREGS8] = {-ENOENT, 24},
+		[PERF_REG_CSKY_EXREGS9] = {-ENOENT, 25},
+		[PERF_REG_CSKY_EXREGS10] = {-ENOENT, 26},
+		[PERF_REG_CSKY_EXREGS11] = {-ENOENT, 27},
+		[PERF_REG_CSKY_EXREGS12] = {-ENOENT, 28},
+		[PERF_REG_CSKY_EXREGS13] = {-ENOENT, 29},
+		[PERF_REG_CSKY_EXREGS14] = {-ENOENT, 30},
+		/* TODO: PERF_REG_CSKY_HI */
+		/* TODO: PERF_REG_CSKY_LO */
+		/* TODO: PERF_REG_CSKY_DCSR */
+	};
+	int idx = 0;
+
+	if (flags & EF_CSKY_ABIV2)
+		idx++;
+
+	if (perf_regnum <  0 || perf_regnum > (int)ARRAY_SIZE(dwarf_csky_regnums) ||
+	    dwarf_csky_regnums[perf_regnum][idx] == 0)
+		return -ENOENT;
+
+	return dwarf_csky_regnums[perf_regnum][idx];
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c
new file mode 100644
index 000000000000..203077b740a0
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/loongarch/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum)
+{
+	if (perf_regnum < 0 || perf_regnum >= PERF_REG_LOONGARCH_MAX)
+		return -ENOENT;
+
+	return perf_regnum;
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c
new file mode 100644
index 000000000000..3bb916b45c66
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/mips/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_mips(int perf_regnum)
+{
+	if (perf_regnum == PERF_REG_MIPS_PC)
+		return 37;
+	if (perf_regnum < 0 || perf_regnum >= PERF_REG_MIPS_MAX)
+		return -ENOENT;
+
+	return perf_regnum;
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c
new file mode 100644
index 000000000000..51892a09725b
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2010 Ian Munsie, IBM Corporation.
+ */
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/powerpc/include/uapi/asm/perf_regs.h"
+
+#define PPC_OP(op)	(((op) >> 26) & 0x3F)
+#define PPC_RA(a)	(((a) >> 16) & 0x1f)
+#define PPC_RT(t)	(((t) >> 21) & 0x1f)
+#define PPC_RB(b)	(((b) >> 11) & 0x1f)
+#define PPC_D(D)	((D) & 0xfffe)
+#define PPC_DS(DS)	((DS) & 0xfffc)
+#define OP_LD	58
+#define OP_STD	62
+
+static int get_source_reg(u32 raw_insn)
+{
+	return PPC_RA(raw_insn);
+}
+
+static int get_target_reg(u32 raw_insn)
+{
+	return PPC_RT(raw_insn);
+}
+
+static int get_offset_opcode(u32 raw_insn)
+{
+	int opcode = PPC_OP(raw_insn);
+
+	/* DS- form */
+	if ((opcode == OP_LD) || (opcode == OP_STD))
+		return PPC_DS(raw_insn);
+	else
+		return PPC_D(raw_insn);
+}
+
+/*
+ * Fills the required fields for op_loc depending on if it
+ * is a source or target.
+ * D form: ins RT,D(RA) -> src_reg1 = RA, offset = D, dst_reg1 = RT
+ * DS form: ins RT,DS(RA) -> src_reg1 = RA, offset = DS, dst_reg1 = RT
+ * X form: ins RT,RA,RB -> src_reg1 = RA, src_reg2 = RB, dst_reg1 = RT
+ */
+void get_powerpc_regs(u32 raw_insn, int is_source,
+		struct annotated_op_loc *op_loc)
+{
+	if (is_source)
+		op_loc->reg1 = get_source_reg(raw_insn);
+	else
+		op_loc->reg1 = get_target_reg(raw_insn);
+
+	if (op_loc->multi_regs)
+		op_loc->reg2 = PPC_RB(raw_insn);
+
+	/* TODO: Implement offset handling for X Form */
+	if ((op_loc->mem_ref) && (PPC_OP(raw_insn) != 31))
+		op_loc->offset = get_offset_opcode(raw_insn);
+}
+
+int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum)
+{
+	static const int dwarf_powerpc_regnums[] = {
+		[PERF_REG_POWERPC_R0] = 0,
+		[PERF_REG_POWERPC_R1] = 1,
+		[PERF_REG_POWERPC_R2] = 2,
+		[PERF_REG_POWERPC_R3] = 3,
+		[PERF_REG_POWERPC_R4] = 4,
+		[PERF_REG_POWERPC_R5] = 5,
+		[PERF_REG_POWERPC_R6] = 6,
+		[PERF_REG_POWERPC_R7] = 7,
+		[PERF_REG_POWERPC_R8] = 8,
+		[PERF_REG_POWERPC_R9] = 9,
+		[PERF_REG_POWERPC_R10] = 10,
+		[PERF_REG_POWERPC_R11] = 11,
+		[PERF_REG_POWERPC_R12] = 12,
+		[PERF_REG_POWERPC_R13] = 13,
+		[PERF_REG_POWERPC_R14] = 14,
+		[PERF_REG_POWERPC_R15] = 15,
+		[PERF_REG_POWERPC_R16] = 16,
+		[PERF_REG_POWERPC_R17] = 17,
+		[PERF_REG_POWERPC_R18] = 18,
+		[PERF_REG_POWERPC_R19] = 19,
+		[PERF_REG_POWERPC_R20] = 20,
+		[PERF_REG_POWERPC_R21] = 21,
+		[PERF_REG_POWERPC_R22] = 22,
+		[PERF_REG_POWERPC_R23] = 23,
+		[PERF_REG_POWERPC_R24] = 24,
+		[PERF_REG_POWERPC_R25] = 25,
+		[PERF_REG_POWERPC_R26] = 26,
+		[PERF_REG_POWERPC_R27] = 27,
+		[PERF_REG_POWERPC_R28] = 28,
+		[PERF_REG_POWERPC_R29] = 29,
+		[PERF_REG_POWERPC_R30] = 30,
+		[PERF_REG_POWERPC_R31] = 31,
+		/* TODO: PERF_REG_POWERPC_NIP */
+		[PERF_REG_POWERPC_MSR] = 66,
+		/* TODO: PERF_REG_POWERPC_ORIG_R3 */
+		[PERF_REG_POWERPC_CTR] = 109,
+		[PERF_REG_POWERPC_LINK] = 108, /* Note, previously in perf encoded as 65? */
+		[PERF_REG_POWERPC_XER] = 101,
+		/* TODO: PERF_REG_POWERPC_CCR */
+		/* TODO: PERF_REG_POWERPC_SOFTE */
+		/* TODO: PERF_REG_POWERPC_TRAP */
+		/* TODO: PERF_REG_POWERPC_DAR */
+		/* TODO: PERF_REG_POWERPC_DSISR */
+		/* TODO: PERF_REG_POWERPC_SIER */
+		/* TODO: PERF_REG_POWERPC_MMCRA */
+		/* TODO: PERF_REG_POWERPC_MMCR0 */
+		/* TODO: PERF_REG_POWERPC_MMCR1 */
+		/* TODO: PERF_REG_POWERPC_MMCR2 */
+		/* TODO: PERF_REG_POWERPC_MMCR3 */
+		/* TODO: PERF_REG_POWERPC_SIER2 */
+		/* TODO: PERF_REG_POWERPC_SIER3 */
+		/* TODO: PERF_REG_POWERPC_PMC1 */
+		/* TODO: PERF_REG_POWERPC_PMC2 */
+		/* TODO: PERF_REG_POWERPC_PMC3 */
+		/* TODO: PERF_REG_POWERPC_PMC4 */
+		/* TODO: PERF_REG_POWERPC_PMC5 */
+		/* TODO: PERF_REG_POWERPC_PMC6 */
+		/* TODO: PERF_REG_POWERPC_SDAR */
+		/* TODO: PERF_REG_POWERPC_SIAR */
+	};
+
+	if (perf_regnum == 0)
+		return 0;
+
+	if (perf_regnum <  0 || perf_regnum > (int)ARRAY_SIZE(dwarf_powerpc_regnums) ||
+	    dwarf_powerpc_regnums[perf_regnum] == 0)
+		return -ENOENT;
+
+	return dwarf_powerpc_regnums[perf_regnum];
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c
new file mode 100644
index 000000000000..090db51aba41
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/riscv/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_riscv(int perf_regnum)
+{
+	if (perf_regnum < 0 || perf_regnum >= PERF_REG_RISCV_MAX)
+		return -ENOENT;
+
+	return perf_regnum;
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c
new file mode 100644
index 000000000000..310a37451bdc
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/s390/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_s390(int perf_regnum)
+{
+	static const int dwarf_s390_regnums[] = {
+		[PERF_REG_S390_R0] = 0,
+		[PERF_REG_S390_R1] = 1,
+		[PERF_REG_S390_R2] = 2,
+		[PERF_REG_S390_R3] = 3,
+		[PERF_REG_S390_R4] = 4,
+		[PERF_REG_S390_R5] = 5,
+		[PERF_REG_S390_R6] = 6,
+		[PERF_REG_S390_R7] = 7,
+		[PERF_REG_S390_R8] = 8,
+		[PERF_REG_S390_R9] = 9,
+		[PERF_REG_S390_R10] = 10,
+		[PERF_REG_S390_R11] = 11,
+		[PERF_REG_S390_R12] = 12,
+		[PERF_REG_S390_R13] = 13,
+		[PERF_REG_S390_R14] = 14,
+		[PERF_REG_S390_R15] = 15,
+		[PERF_REG_S390_FP0] = 16,
+		[PERF_REG_S390_FP1] = 20,
+		[PERF_REG_S390_FP2] = 17,
+		[PERF_REG_S390_FP3] = 21,
+		[PERF_REG_S390_FP4] = 18,
+		[PERF_REG_S390_FP5] = 22,
+		[PERF_REG_S390_FP6] = 19,
+		[PERF_REG_S390_FP7] = 23,
+		[PERF_REG_S390_FP8] = 24,
+		[PERF_REG_S390_FP9] = 28,
+		[PERF_REG_S390_FP10] = 25,
+		[PERF_REG_S390_FP11] = 29,
+		[PERF_REG_S390_FP12] = 26,
+		[PERF_REG_S390_FP13] = 30,
+		[PERF_REG_S390_FP14] = 27,
+		[PERF_REG_S390_FP15] = 31,
+		[PERF_REG_S390_MASK] = 64,
+		[PERF_REG_S390_PC] = 65,
+	};
+
+	if (perf_regnum == 0)
+		return 0;
+
+	if (perf_regnum <  0 || perf_regnum > (int)ARRAY_SIZE(dwarf_s390_regnums) ||
+	    dwarf_s390_regnums[perf_regnum] == 0)
+		return -ENOENT;
+
+	return dwarf_s390_regnums[perf_regnum];
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
new file mode 100644
index 000000000000..cadef120aeb4
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * dwarf-regs.c : Mapping of DWARF debug register numbers into register names.
+ * Extracted from probe-finder.c
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ */
+
+#include <errno.h> /* for EINVAL */
+#include <string.h> /* for strcmp */
+#include <linux/kernel.h> /* for ARRAY_SIZE */
+#include <dwarf-regs.h>
+#include "../../../arch/x86/include/uapi/asm/perf_regs.h"
+
+struct dwarf_regs_idx {
+	const char *name;
+	int dwarf_regnum;
+};
+
+static const struct dwarf_regs_idx i386_regidx_table[] = {
+	{ "eax", 0 }, { "ax", 0 }, { "al", 0 },
+	{ "ecx", 1 }, { "cx", 1 }, { "cl", 1 },
+	{ "edx", 2 }, { "dx", 2 }, { "dl", 2 },
+	{ "ebx", 3 }, { "bx", 3 }, { "bl", 3 },
+	{ "esp", 4 }, { "sp", 4 }, { "$stack", 4},
+	{ "ebp", 5 }, { "bp", 5 },
+	{ "esi", 6 }, { "si", 6 },
+	{ "edi", 7 }, { "di", 7 },
+	// 8 - Return Address RA
+	{ "eflags", 9}, { "flags", 9},
+	// 10 - reserved
+	{ "st0", 11},
+	{ "st1", 12},
+	{ "st2", 13},
+	{ "st3", 14},
+	{ "st4", 15},
+	{ "st5", 16},
+	{ "st6", 17},
+	{ "st7", 18},
+	// 19-20 - reserved
+	{ "xmm0", 21},
+	{ "xmm1", 22},
+	{ "xmm2", 23},
+	{ "xmm3", 24},
+	{ "xmm4", 25},
+	{ "xmm5", 26},
+	{ "xmm6", 27},
+	{ "xmm7", 28},
+	{ "mm0", 29},
+	{ "mm1", 30},
+	{ "mm2", 31},
+	{ "mm3", 32},
+	{ "mm4", 33},
+	{ "mm5", 34},
+	{ "mm6", 35},
+	{ "mm7", 36},
+	// 37-38 - unknown
+	{ "mxcsr", 39}, // 128-bit Media Control and Status
+	{ "es", 40},
+	{ "cs", 41},
+	{ "ss", 42},
+	{ "ds", 43},
+	{ "fs", 44},
+	{ "gs", 45},
+	// 46-47 - reserved
+	{ "tr", 48}, // Task Register
+	{ "ldtr", 49}, // LDT Register
+	// 50-92 - reserved
+	{ "fs.base", 92},
+	{ "gs.base", 93},
+	// End of regular dwarf registers.
+	{ "eip", DWARF_REG_PC }, { "ip", DWARF_REG_PC },
+};
+
+static const struct dwarf_regs_idx x86_64_regidx_table[] = {
+	{ "rax", 0 }, { "eax", 0 }, { "ax", 0 }, { "al", 0 },
+	{ "rdx", 1 }, { "edx", 1 }, { "dx", 1 }, { "dl", 1 },
+	{ "rcx", 2 }, { "ecx", 2 }, { "cx", 2 }, { "cl", 2 },
+	{ "rbx", 3 }, { "edx", 3 }, { "bx", 3 }, { "bl", 3 },
+	{ "rsi", 4 }, { "esi", 4 }, { "si", 4 }, { "sil", 4 },
+	{ "rdi", 5 }, { "edi", 5 }, { "di", 5 }, { "dil", 5 },
+	{ "rbp", 6 }, { "ebp", 6 }, { "bp", 6 }, { "bpl", 6 },
+	{ "rsp", 7 }, { "esp", 7 }, { "sp", 7 }, { "spl", 7 },
+	{ "r8", 8 }, { "r8d", 8 }, { "r8w", 8 }, { "r8b", 8 },
+	{ "r9", 9 }, { "r9d", 9 }, { "r9w", 9 }, { "r9b", 9 },
+	{ "r10", 10 }, { "r10d", 10 }, { "r10w", 10 }, { "r10b", 10 },
+	{ "r11", 11 }, { "r11d", 11 }, { "r11w", 11 }, { "r11b", 11 },
+	{ "r12", 12 }, { "r12d", 12 }, { "r12w", 12 }, { "r12b", 12 },
+	{ "r13", 13 }, { "r13d", 13 }, { "r13w", 13 }, { "r13b", 13 },
+	{ "r14", 14 }, { "r14d", 14 }, { "r14w", 14 }, { "r14b", 14 },
+	{ "r15", 15 }, { "r15d", 15 }, { "r15w", 15 }, { "r15b", 15 },
+	// 16 - Return Address RA
+	{ "xmm0", 17},
+	{ "xmm1", 18},
+	{ "xmm2", 19},
+	{ "xmm3", 20},
+	{ "xmm4", 21},
+	{ "xmm5", 22},
+	{ "xmm6", 23},
+	{ "xmm7", 24},
+	{ "xmm8", 25},
+	{ "xmm9", 26},
+	{ "xmm10", 27},
+	{ "xmm11", 28},
+	{ "xmm12", 29},
+	{ "xmm13", 30},
+	{ "xmm14", 31},
+	{ "xmm15", 32},
+	{ "st0", 33},
+	{ "st1", 34},
+	{ "st2", 35},
+	{ "st3", 36},
+	{ "st4", 37},
+	{ "st5", 38},
+	{ "st6", 39},
+	{ "st7", 40},
+	{ "mm0", 41},
+	{ "mm1", 42},
+	{ "mm2", 43},
+	{ "mm3", 44},
+	{ "mm4", 45},
+	{ "mm5", 46},
+	{ "mm6", 47},
+	{ "mm7", 48},
+	{ "rflags", 49}, { "eflags", 49}, { "flags", 49},
+	{ "es", 50},
+	{ "cs", 51},
+	{ "ss", 52},
+	{ "ds", 53},
+	{ "fs", 54},
+	{ "gs", 55},
+	// 56-47 - reserved
+	{ "fs.base", 58},
+	{ "gs.base", 59},
+	// 60-61 - reserved
+	{ "tr", 62}, // Task Register
+	{ "ldtr", 63}, // LDT Register
+	{ "mxcsr", 64}, // 128-bit Media Control and Status
+	{ "fcw", 65}, // x87 Control Word
+	{ "fsw", 66}, // x87 Status Word
+	// End of regular dwarf registers.
+	{ "rip", DWARF_REG_PC }, { "eip", DWARF_REG_PC }, { "ip", DWARF_REG_PC },
+};
+
+static int get_regnum(const struct dwarf_regs_idx *entries, size_t num_entries, const char *name)
+{
+	if (*name != '%')
+		return -EINVAL;
+
+	name++;
+	for (size_t i = 0; i < num_entries; i++) {
+		if (!strcmp(entries[i].name, name))
+			return entries[i].dwarf_regnum;
+	}
+	return -ENOENT;
+}
+
+int __get_dwarf_regnum_i386(const char *name)
+{
+	return get_regnum(i386_regidx_table, ARRAY_SIZE(i386_regidx_table), name);
+}
+
+int __get_dwarf_regnum_x86_64(const char *name)
+{
+	return get_regnum(x86_64_regidx_table, ARRAY_SIZE(x86_64_regidx_table), name);
+}
+
+int __get_dwarf_regnum_for_perf_regnum_i386(int perf_regnum)
+{
+	static const int dwarf_i386_regnums[] = {
+		[PERF_REG_X86_AX] = 0,
+		[PERF_REG_X86_BX] = 3,
+		[PERF_REG_X86_CX] = 1,
+		[PERF_REG_X86_DX] = 2,
+		[PERF_REG_X86_SI] = 6,
+		[PERF_REG_X86_DI] = 7,
+		[PERF_REG_X86_BP] = 5,
+		[PERF_REG_X86_SP] = 4,
+		[PERF_REG_X86_IP] = 8,
+		[PERF_REG_X86_FLAGS] = 9,
+		[PERF_REG_X86_CS] = 41,
+		[PERF_REG_X86_SS] = 42,
+		[PERF_REG_X86_DS] = 43,
+		[PERF_REG_X86_ES] = 40,
+		[PERF_REG_X86_FS] = 44,
+		[PERF_REG_X86_GS] = 45,
+		[PERF_REG_X86_XMM0] = 21,
+		[PERF_REG_X86_XMM1] = 22,
+		[PERF_REG_X86_XMM2] = 23,
+		[PERF_REG_X86_XMM3] = 24,
+		[PERF_REG_X86_XMM4] = 25,
+		[PERF_REG_X86_XMM5] = 26,
+		[PERF_REG_X86_XMM6] = 27,
+		[PERF_REG_X86_XMM7] = 28,
+	};
+
+	if (perf_regnum == 0)
+		return 0;
+
+	if (perf_regnum <  0 || perf_regnum > (int)ARRAY_SIZE(dwarf_i386_regnums) ||
+	    dwarf_i386_regnums[perf_regnum] == 0)
+		return -ENOENT;
+
+	return dwarf_i386_regnums[perf_regnum];
+}
+
+int __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum)
+{
+	static const int dwarf_x86_64_regnums[] = {
+		[PERF_REG_X86_AX] = 0,
+		[PERF_REG_X86_BX] = 3,
+		[PERF_REG_X86_CX] = 2,
+		[PERF_REG_X86_DX] = 1,
+		[PERF_REG_X86_SI] = 4,
+		[PERF_REG_X86_DI] = 5,
+		[PERF_REG_X86_BP] = 6,
+		[PERF_REG_X86_SP] = 7,
+		[PERF_REG_X86_IP] = 16,
+		[PERF_REG_X86_FLAGS] = 49,
+		[PERF_REG_X86_CS] = 51,
+		[PERF_REG_X86_SS] = 52,
+		[PERF_REG_X86_DS] = 53,
+		[PERF_REG_X86_ES] = 50,
+		[PERF_REG_X86_FS] = 54,
+		[PERF_REG_X86_GS] = 55,
+		[PERF_REG_X86_R8] = 8,
+		[PERF_REG_X86_R9] = 9,
+		[PERF_REG_X86_R10] = 10,
+		[PERF_REG_X86_R11] = 11,
+		[PERF_REG_X86_R12] = 12,
+		[PERF_REG_X86_R13] = 13,
+		[PERF_REG_X86_R14] = 14,
+		[PERF_REG_X86_R15] = 15,
+		[PERF_REG_X86_XMM0] = 17,
+		[PERF_REG_X86_XMM1] = 18,
+		[PERF_REG_X86_XMM2] = 19,
+		[PERF_REG_X86_XMM3] = 20,
+		[PERF_REG_X86_XMM4] = 21,
+		[PERF_REG_X86_XMM5] = 22,
+		[PERF_REG_X86_XMM6] = 23,
+		[PERF_REG_X86_XMM7] = 24,
+		[PERF_REG_X86_XMM8] = 25,
+		[PERF_REG_X86_XMM9] = 26,
+		[PERF_REG_X86_XMM10] = 27,
+		[PERF_REG_X86_XMM11] = 28,
+		[PERF_REG_X86_XMM12] = 29,
+		[PERF_REG_X86_XMM13] = 30,
+		[PERF_REG_X86_XMM14] = 31,
+		[PERF_REG_X86_XMM15] = 32,
+	};
+
+	if (perf_regnum == 0)
+		return 0;
+
+	if (perf_regnum <  0 || perf_regnum > (int)ARRAY_SIZE(dwarf_x86_64_regnums) ||
+	    dwarf_x86_64_regnums[perf_regnum] == 0)
+		return -ENOENT;
+
+	return dwarf_x86_64_regnums[perf_regnum];
+}
diff --git a/tools/perf/util/dwarf-regs-csky.c b/tools/perf/util/dwarf-regs-csky.c
deleted file mode 100644
index d38ef1f07f3e..000000000000
--- a/tools/perf/util/dwarf-regs-csky.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
-// Mapping of DWARF debug register numbers into register names.
-
-#include <stddef.h>
-#include <dwarf-regs.h>
-
-#define CSKY_ABIV2_MAX_REGS 73
-const char *csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = {
-	/* r0 ~ r8 */
-	"%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", "%regs2", "%regs3",
-	/* r9 ~ r15 */
-	"%regs4", "%regs5", "%regs6", "%regs7", "%regs8", "%regs9", "%sp",
-	"%lr",
-	/* r16 ~ r23 */
-	"%exregs0", "%exregs1", "%exregs2", "%exregs3", "%exregs4",
-	"%exregs5", "%exregs6", "%exregs7",
-	/* r24 ~ r31 */
-	"%exregs8", "%exregs9", "%exregs10", "%exregs11", "%exregs12",
-	"%exregs13", "%exregs14", "%tls",
-	"%pc", NULL, NULL, NULL, "%hi", "%lo", NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	"%epc",
-};
-
-#define CSKY_ABIV1_MAX_REGS 57
-const char *csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = {
-	/* r0 ~ r8 */
-	"%sp", "%regs9", "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1",
-	/* r9 ~ r15 */
-	"%regs2", "%regs3", "%regs4", "%regs5", "%regs6", "%regs7", "%regs8",
-	"%lr",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	"%epc",
-};
-
-const char *get_csky_regstr(unsigned int n, unsigned int flags)
-{
-	if (flags & EF_CSKY_ABIV2)
-		return (n < CSKY_ABIV2_MAX_REGS) ? csky_dwarf_regs_table_abiv2[n] : NULL;
-
-	return (n < CSKY_ABIV1_MAX_REGS) ? csky_dwarf_regs_table_abiv1[n] : NULL;
-}
diff --git a/tools/perf/util/dwarf-regs-powerpc.c b/tools/perf/util/dwarf-regs-powerpc.c
deleted file mode 100644
index caf77a234c78..000000000000
--- a/tools/perf/util/dwarf-regs-powerpc.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Mapping of DWARF debug register numbers into register names.
- *
- * Copyright (C) 2010 Ian Munsie, IBM Corporation.
- */
-
-#include <dwarf-regs.h>
-
-#define PPC_OP(op)	(((op) >> 26) & 0x3F)
-#define PPC_RA(a)	(((a) >> 16) & 0x1f)
-#define PPC_RT(t)	(((t) >> 21) & 0x1f)
-#define PPC_RB(b)	(((b) >> 11) & 0x1f)
-#define PPC_D(D)	((D) & 0xfffe)
-#define PPC_DS(DS)	((DS) & 0xfffc)
-#define OP_LD	58
-#define OP_STD	62
-
-static int get_source_reg(u32 raw_insn)
-{
-	return PPC_RA(raw_insn);
-}
-
-static int get_target_reg(u32 raw_insn)
-{
-	return PPC_RT(raw_insn);
-}
-
-static int get_offset_opcode(u32 raw_insn)
-{
-	int opcode = PPC_OP(raw_insn);
-
-	/* DS- form */
-	if ((opcode == OP_LD) || (opcode == OP_STD))
-		return PPC_DS(raw_insn);
-	else
-		return PPC_D(raw_insn);
-}
-
-/*
- * Fills the required fields for op_loc depending on if it
- * is a source or target.
- * D form: ins RT,D(RA) -> src_reg1 = RA, offset = D, dst_reg1 = RT
- * DS form: ins RT,DS(RA) -> src_reg1 = RA, offset = DS, dst_reg1 = RT
- * X form: ins RT,RA,RB -> src_reg1 = RA, src_reg2 = RB, dst_reg1 = RT
- */
-void get_powerpc_regs(u32 raw_insn, int is_source,
-		struct annotated_op_loc *op_loc)
-{
-	if (is_source)
-		op_loc->reg1 = get_source_reg(raw_insn);
-	else
-		op_loc->reg1 = get_target_reg(raw_insn);
-
-	if (op_loc->multi_regs)
-		op_loc->reg2 = PPC_RB(raw_insn);
-
-	/* TODO: Implement offset handling for X Form */
-	if ((op_loc->mem_ref) && (PPC_OP(raw_insn) != 31))
-		op_loc->offset = get_offset_opcode(raw_insn);
-}
diff --git a/tools/perf/util/dwarf-regs-x86.c b/tools/perf/util/dwarf-regs-x86.c
deleted file mode 100644
index 7a55c65e8da6..000000000000
--- a/tools/perf/util/dwarf-regs-x86.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * dwarf-regs.c : Mapping of DWARF debug register numbers into register names.
- * Extracted from probe-finder.c
- *
- * Written by Masami Hiramatsu <mhiramat@redhat.com>
- */
-
-#include <errno.h> /* for EINVAL */
-#include <string.h> /* for strcmp */
-#include <linux/kernel.h> /* for ARRAY_SIZE */
-#include <dwarf-regs.h>
-
-struct dwarf_regs_idx {
-	const char *name;
-	int idx;
-};
-
-static const struct dwarf_regs_idx x86_regidx_table[] = {
-	{ "rax", 0 }, { "eax", 0 }, { "ax", 0 }, { "al", 0 },
-	{ "rdx", 1 }, { "edx", 1 }, { "dx", 1 }, { "dl", 1 },
-	{ "rcx", 2 }, { "ecx", 2 }, { "cx", 2 }, { "cl", 2 },
-	{ "rbx", 3 }, { "edx", 3 }, { "bx", 3 }, { "bl", 3 },
-	{ "rsi", 4 }, { "esi", 4 }, { "si", 4 }, { "sil", 4 },
-	{ "rdi", 5 }, { "edi", 5 }, { "di", 5 }, { "dil", 5 },
-	{ "rbp", 6 }, { "ebp", 6 }, { "bp", 6 }, { "bpl", 6 },
-	{ "rsp", 7 }, { "esp", 7 }, { "sp", 7 }, { "spl", 7 },
-	{ "r8", 8 }, { "r8d", 8 }, { "r8w", 8 }, { "r8b", 8 },
-	{ "r9", 9 }, { "r9d", 9 }, { "r9w", 9 }, { "r9b", 9 },
-	{ "r10", 10 }, { "r10d", 10 }, { "r10w", 10 }, { "r10b", 10 },
-	{ "r11", 11 }, { "r11d", 11 }, { "r11w", 11 }, { "r11b", 11 },
-	{ "r12", 12 }, { "r12d", 12 }, { "r12w", 12 }, { "r12b", 12 },
-	{ "r13", 13 }, { "r13d", 13 }, { "r13w", 13 }, { "r13b", 13 },
-	{ "r14", 14 }, { "r14d", 14 }, { "r14w", 14 }, { "r14b", 14 },
-	{ "r15", 15 }, { "r15d", 15 }, { "r15w", 15 }, { "r15b", 15 },
-	{ "rip", DWARF_REG_PC },
-};
-
-int get_x86_regnum(const char *name)
-{
-	unsigned int i;
-
-	if (*name != '%')
-		return -EINVAL;
-
-	for (i = 0; i < ARRAY_SIZE(x86_regidx_table); i++)
-		if (!strcmp(x86_regidx_table[i].name, name + 1))
-			return x86_regidx_table[i].idx;
-	return -ENOENT;
-}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index 28a1cfdf26d4..797f455eba0d 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -27,11 +27,11 @@
 #include "../arch/mips/include/dwarf-regs-table.h"
 #include "../arch/loongarch/include/dwarf-regs-table.h"
 
-#define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL)
-
 /* Return architecture dependent register string (for kprobe-tracer) */
 const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int flags)
 {
+	#define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL)
+
 	if (machine == EM_NONE) {
 		/* Generic arch - use host arch */
 		machine = EM_HOST;
@@ -46,7 +46,7 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int
 	case EM_AARCH64:
 		return __get_dwarf_regstr(aarch64_regstr_tbl, n);
 	case EM_CSKY:
-		return get_csky_regstr(n, flags);
+		return __get_csky_regstr(n, flags);
 	case EM_SH:
 		return __get_dwarf_regstr(sh_regstr_tbl, n);
 	case EM_S390:
@@ -69,22 +69,28 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 	}
 	return NULL;
+
+	#undef __get_dwarf_regstr
 }
 
-#if EM_HOST != EM_X86_64 && EM_HOST != EM_386
-__weak int get_arch_regnum(const char *name __maybe_unused)
+static int __get_dwarf_regnum(const char *const *regstr, size_t num_regstr, const char *name)
 {
-	return -ENOTSUP;
+	for (size_t i = 0; i < num_regstr; i++) {
+		if (regstr[i] && !strcmp(regstr[i], name))
+			return i;
+	}
+	return -ENOENT;
 }
-#endif
 
 /* Return DWARF register number from architecture register name */
-int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags __maybe_unused)
+int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags)
 {
 	char *regname = strdup(name);
 	int reg = -1;
 	char *p;
 
+	#define _get_dwarf_regnum(tbl, name) __get_dwarf_regnum(tbl, ARRAY_SIZE(tbl), name)
+
 	if (regname == NULL)
 		return -EINVAL;
 
@@ -98,19 +104,134 @@ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags
 		machine = EM_HOST;
 	}
 	switch (machine) {
-#if EM_HOST != EM_X86_64 && EM_HOST != EM_386
-	case EM_HOST:
-		reg = get_arch_regnum(regname);
-		break;
-#endif
 	case EM_X86_64:
-		fallthrough;
+		reg = __get_dwarf_regnum_x86_64(name);
+		break;
 	case EM_386:
-		reg = get_x86_regnum(regname);
+		reg = __get_dwarf_regnum_i386(name);
+		break;
+	case EM_ARM:
+		reg = _get_dwarf_regnum(arm_regstr_tbl, name);
+		break;
+	case EM_AARCH64:
+		reg = _get_dwarf_regnum(aarch64_regstr_tbl, name);
+		break;
+	case EM_CSKY:
+		reg = __get_csky_regnum(name, flags);
+		break;
+	case EM_SH:
+		reg = _get_dwarf_regnum(sh_regstr_tbl, name);
+		break;
+	case EM_S390:
+		reg = _get_dwarf_regnum(s390_regstr_tbl, name);
+		break;
+	case EM_PPC:
+	case EM_PPC64:
+		reg = _get_dwarf_regnum(powerpc_regstr_tbl, name);
+		break;
+	case EM_RISCV:
+		reg = _get_dwarf_regnum(riscv_regstr_tbl, name);
+		break;
+	case EM_SPARC:
+	case EM_SPARCV9:
+		reg = _get_dwarf_regnum(sparc_regstr_tbl, name);
+		break;
+	case EM_XTENSA:
+		reg = _get_dwarf_regnum(xtensa_regstr_tbl, name);
+		break;
+	case EM_MIPS:
+		reg = _get_dwarf_regnum(mips_regstr_tbl, name);
+		break;
+	case EM_LOONGARCH:
+		reg = _get_dwarf_regnum(loongarch_regstr_tbl, name);
 		break;
 	default:
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 	}
 	free(regname);
 	return reg;
+
+	#undef _get_dwarf_regnum
+}
+
+static int get_libdw_frame_nregs(unsigned int machine, unsigned int flags __maybe_unused)
+{
+	switch (machine) {
+	case EM_X86_64:
+		return 17;
+	case EM_386:
+		return 9;
+	case EM_ARM:
+		return 16;
+	case EM_AARCH64:
+		return 97;
+	case EM_CSKY:
+		return 38;
+	case EM_S390:
+		return 32;
+	case EM_PPC:
+	case EM_PPC64:
+		return 145;
+	case EM_RISCV:
+		return 66;
+	case EM_SPARC:
+	case EM_SPARCV9:
+		return 103;
+	case EM_LOONGARCH:
+		return 74;
+	case EM_MIPS:
+		return 71;
+	default:
+		return 0;
+	}
+}
+
+int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine,
+				     unsigned int flags, bool only_libdw_supported)
+{
+	int reg;
+
+	switch (machine) {
+	case EM_X86_64:
+		reg = __get_dwarf_regnum_for_perf_regnum_x86_64(perf_regnum);
+		break;
+	case EM_386:
+		reg = __get_dwarf_regnum_for_perf_regnum_i386(perf_regnum);
+		break;
+	case EM_ARM:
+		reg = __get_dwarf_regnum_for_perf_regnum_arm(perf_regnum);
+		break;
+	case EM_AARCH64:
+		reg = __get_dwarf_regnum_for_perf_regnum_arm64(perf_regnum);
+		break;
+	case EM_CSKY:
+		reg = __get_dwarf_regnum_for_perf_regnum_csky(perf_regnum, flags);
+		break;
+	case EM_PPC:
+	case EM_PPC64:
+		reg = __get_dwarf_regnum_for_perf_regnum_powerpc(perf_regnum);
+		break;
+	case EM_RISCV:
+		reg = __get_dwarf_regnum_for_perf_regnum_riscv(perf_regnum);
+		break;
+	case EM_S390:
+		reg = __get_dwarf_regnum_for_perf_regnum_s390(perf_regnum);
+		break;
+	case EM_LOONGARCH:
+		reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum);
+		break;
+	case EM_MIPS:
+		reg = __get_dwarf_regnum_for_perf_regnum_mips(perf_regnum);
+		break;
+	default:
+		pr_err("ELF MACHINE %x is not supported.\n", machine);
+		return -ENOENT;
+	}
+	if (reg >= 0 && only_libdw_supported) {
+		int nregs = get_libdw_frame_nregs(machine, flags);
+
+		if (reg >= nregs)
+			reg = -ENOENT;
+	}
+	return reg;
 }
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index f1626d2032cd..93d475a80f14 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -216,6 +216,34 @@ static void perf_env__purge_bpf(struct perf_env *env __maybe_unused)
 }
 #endif // HAVE_LIBBPF_SUPPORT
 
+void free_cpu_domain_info(struct cpu_domain_map **cd_map, u32 schedstat_version, u32 nr)
+{
+	if (!cd_map)
+		return;
+
+	for (u32 i = 0; i < nr; i++) {
+		if (!cd_map[i])
+			continue;
+
+		for (u32 j = 0; j < cd_map[i]->nr_domains; j++) {
+			struct domain_info *d_info = cd_map[i]->domains[j];
+
+			if (!d_info)
+				continue;
+
+			if (schedstat_version >= 17)
+				zfree(&d_info->dname);
+
+			zfree(&d_info->cpumask);
+			zfree(&d_info->cpulist);
+			zfree(&d_info);
+		}
+		zfree(&cd_map[i]->domains);
+		zfree(&cd_map[i]);
+	}
+	zfree(&cd_map);
+}
+
 void perf_env__exit(struct perf_env *env)
 {
 	int i, j;
@@ -265,6 +293,7 @@ void perf_env__exit(struct perf_env *env)
 		zfree(&env->pmu_caps[i].pmu_name);
 	}
 	zfree(&env->pmu_caps);
+	free_cpu_domain_info(env->cpu_domain, env->schedstat_version, env->nr_cpus_avail);
 }
 
 void perf_env__init(struct perf_env *env)
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index 9977b85523a8..a4501cbca375 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -54,6 +54,19 @@ struct pmu_caps {
 	char            *pmu_name;
 };
 
+struct domain_info {
+	u32	domain;
+	char	*dname;
+	char	*cpumask;
+	char	*cpulist;
+};
+
+struct cpu_domain_map {
+	u32			cpu;
+	u32			nr_domains;
+	struct domain_info	**domains;
+};
+
 typedef const char *(arch_syscalls__strerrno_t)(int err);
 
 struct perf_env {
@@ -61,6 +74,9 @@ struct perf_env {
 	char			*os_release;
 	char			*version;
 	char			*arch;
+	/* e_machine expanded from 16 to 32-bits for alignment. */
+	u32			e_machine;
+	u32			e_flags;
 	int			nr_cpus_online;
 	int			nr_cpus_avail;
 	char			*cpu_desc;
@@ -70,6 +86,8 @@ struct perf_env {
 	unsigned int		max_branches;
 	unsigned int		br_cntr_nr;
 	unsigned int		br_cntr_width;
+	unsigned int		schedstat_version;
+	unsigned int		max_sched_domains;
 	int			kernel_is_64_bit;
 
 	int			nr_cmdline;
@@ -92,6 +110,7 @@ struct perf_env {
 	char			**cpu_pmu_caps;
 	struct cpu_topology_map	*cpu;
 	struct cpu_cache_level	*caches;
+	struct cpu_domain_map	**cpu_domain;
 	int			 caches_cnt;
 	u32			comp_ratio;
 	u32			comp_ver;
@@ -151,6 +170,7 @@ struct bpf_prog_info_node;
 struct btf_node;
 
 int perf_env__read_core_pmu_caps(struct perf_env *env);
+void free_cpu_domain_info(struct cpu_domain_map **cd_map, u32 schedstat_version, u32 nr);
 void perf_env__exit(struct perf_env *env);
 
 int perf_env__kernel_is_64_bit(struct perf_env *env);
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 4c92cc1a952c..bc045fddf7d5 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -83,6 +83,8 @@ static const char *perf_event__names[] = {
 	[PERF_RECORD_FINISHED_INIT]		= "FINISHED_INIT",
 	[PERF_RECORD_COMPRESSED2]		= "COMPRESSED2",
 	[PERF_RECORD_BPF_METADATA]		= "BPF_METADATA",
+	[PERF_RECORD_SCHEDSTAT_CPU]		= "SCHEDSTAT_CPU",
+	[PERF_RECORD_SCHEDSTAT_DOMAIN]		= "SCHEDSTAT_DOMAIN",
 };
 
 const char *perf_event__name(unsigned int id)
@@ -571,6 +573,56 @@ size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *ma
 	return ret;
 }
 
+size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp)
+{
+	struct perf_record_schedstat_cpu *cs = &event->schedstat_cpu;
+	size_t size = fprintf(fp, "\ncpu%u ", cs->cpu);
+	__u16 version = cs->version;
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		\
+	size += fprintf(fp, "%" PRIu64 " ", (uint64_t)cs->_ver._name)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+		return size;
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
+		return size;
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
+		return size;
+	}
+#undef CPU_FIELD
+
+	return fprintf(fp, "Unsupported /proc/schedstat version %d.\n",
+		       event->schedstat_cpu.version);
+}
+
+size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp)
+{
+	struct perf_record_schedstat_domain *ds = &event->schedstat_domain;
+	__u16 version = ds->version;
+	size_t size = fprintf(fp, "\ndomain%u ", ds->domain);
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		\
+	size += fprintf(fp, "%" PRIu64 " ", (uint64_t)ds->_ver._name)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+		return size;
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
+		return size;
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
+		return size;
+	}
+#undef DOMAIN_FIELD
+
+	return fprintf(fp, "Unsupported /proc/schedstat version %d.\n",
+		       event->schedstat_domain.version);
+}
+
 size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FILE *fp)
 {
 	size_t ret = fprintf(fp, "PERF_RECORD_%s",
@@ -646,7 +698,6 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr,
 	struct machine *machine = maps__machine(maps);
 	bool load_map = false;
 
-	maps__zput(al->maps);
 	map__zput(al->map);
 	thread__zput(al->thread);
 	al->thread = thread__get(thread);
@@ -684,7 +735,6 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr,
 
 		return NULL;
 	}
-	al->maps = maps__get(maps);
 	al->map = maps__find(maps, al->addr);
 	if (al->map != NULL) {
 		/*
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 64c63b59d617..2ea83fdf8a03 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -392,6 +392,8 @@ size_t perf_event__fprintf_ksymbol(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_bpf(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_bpf_metadata(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *machine,FILE *fp);
+size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FILE *fp);
 
 int kallsyms__get_function_start(const char *kallsyms_filename,
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 03674d2cbd01..591bdf0b3e2a 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -359,36 +359,107 @@ int evlist__add_newtp(struct evlist *evlist, const char *sys, const char *name,
 }
 #endif
 
-struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity)
+/*
+ * Should sched_setaffinity be used with evlist__for_each_cpu? Determine if
+ * migrating the thread will avoid possibly numerous IPIs.
+ */
+static bool evlist__use_affinity(struct evlist *evlist)
+{
+	struct evsel *pos;
+	struct perf_cpu_map *used_cpus = NULL;
+	bool ret = false;
+
+	if (evlist->no_affinity || !evlist->core.user_requested_cpus ||
+	    cpu_map__is_dummy(evlist->core.user_requested_cpus))
+		return false;
+
+	evlist__for_each_entry(evlist, pos) {
+		struct perf_cpu_map *intersect;
+
+		if (!perf_pmu__benefits_from_affinity(pos->pmu))
+			continue;
+
+		if (evsel__is_dummy_event(pos)) {
+			/*
+			 * The dummy event is opened on all CPUs so assume >1
+			 * event with shared CPUs.
+			 */
+			ret = true;
+			break;
+		}
+		if (evsel__is_retire_lat(pos)) {
+			/*
+			 * Retirement latency events are similar to tool ones in
+			 * their implementation, and so don't require affinity.
+			 */
+			continue;
+		}
+		if (perf_cpu_map__is_empty(used_cpus)) {
+			/* First benefitting event, we want >1 on a common CPU. */
+			used_cpus = perf_cpu_map__get(pos->core.cpus);
+			continue;
+		}
+		if ((pos->core.attr.read_format & PERF_FORMAT_GROUP) &&
+		    evsel__leader(pos) != pos) {
+			/* Skip members of the same sample group. */
+			continue;
+		}
+		intersect = perf_cpu_map__intersect(used_cpus, pos->core.cpus);
+		if (!perf_cpu_map__is_empty(intersect)) {
+			/* >1 event with shared CPUs. */
+			perf_cpu_map__put(intersect);
+			ret = true;
+			break;
+		}
+		perf_cpu_map__put(intersect);
+		perf_cpu_map__merge(&used_cpus, pos->core.cpus);
+	}
+	perf_cpu_map__put(used_cpus);
+	return ret;
+}
+
+void evlist_cpu_iterator__init(struct evlist_cpu_iterator *itr, struct evlist *evlist)
 {
-	struct evlist_cpu_iterator itr = {
+	*itr = (struct evlist_cpu_iterator){
 		.container = evlist,
 		.evsel = NULL,
 		.cpu_map_idx = 0,
 		.evlist_cpu_map_idx = 0,
 		.evlist_cpu_map_nr = perf_cpu_map__nr(evlist->core.all_cpus),
 		.cpu = (struct perf_cpu){ .cpu = -1},
-		.affinity = affinity,
+		.affinity = NULL,
 	};
 
 	if (evlist__empty(evlist)) {
 		/* Ensure the empty list doesn't iterate. */
-		itr.evlist_cpu_map_idx = itr.evlist_cpu_map_nr;
-	} else {
-		itr.evsel = evlist__first(evlist);
-		if (itr.affinity) {
-			itr.cpu = perf_cpu_map__cpu(evlist->core.all_cpus, 0);
-			affinity__set(itr.affinity, itr.cpu.cpu);
-			itr.cpu_map_idx = perf_cpu_map__idx(itr.evsel->core.cpus, itr.cpu);
-			/*
-			 * If this CPU isn't in the evsel's cpu map then advance
-			 * through the list.
-			 */
-			if (itr.cpu_map_idx == -1)
-				evlist_cpu_iterator__next(&itr);
-		}
+		itr->evlist_cpu_map_idx = itr->evlist_cpu_map_nr;
+		return;
 	}
-	return itr;
+
+	if (evlist__use_affinity(evlist)) {
+		if (affinity__setup(&itr->saved_affinity) == 0)
+			itr->affinity = &itr->saved_affinity;
+	}
+	itr->evsel = evlist__first(evlist);
+	itr->cpu = perf_cpu_map__cpu(evlist->core.all_cpus, 0);
+	if (itr->affinity)
+		affinity__set(itr->affinity, itr->cpu.cpu);
+	itr->cpu_map_idx = perf_cpu_map__idx(itr->evsel->core.cpus, itr->cpu);
+	/*
+	 * If this CPU isn't in the evsel's cpu map then advance
+	 * through the list.
+	 */
+	if (itr->cpu_map_idx == -1)
+		evlist_cpu_iterator__next(itr);
+}
+
+void evlist_cpu_iterator__exit(struct evlist_cpu_iterator *itr)
+{
+	if (!itr->affinity)
+		return;
+
+	affinity__cleanup(itr->affinity);
+	itr->affinity = NULL;
 }
 
 void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr)
@@ -418,14 +489,11 @@ void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr)
 		 */
 		if (evlist_cpu_itr->cpu_map_idx == -1)
 			evlist_cpu_iterator__next(evlist_cpu_itr);
+	} else {
+		evlist_cpu_iterator__exit(evlist_cpu_itr);
 	}
 }
 
-bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr)
-{
-	return evlist_cpu_itr->evlist_cpu_map_idx >= evlist_cpu_itr->evlist_cpu_map_nr;
-}
-
 static int evsel__strcmp(struct evsel *pos, char *evsel_name)
 {
 	if (!evsel_name)
@@ -453,19 +521,11 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl
 {
 	struct evsel *pos;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity saved_affinity, *affinity = NULL;
 	bool has_imm = false;
 
-	// See explanation in evlist__close()
-	if (!cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
-		if (affinity__setup(&saved_affinity) < 0)
-			return;
-		affinity = &saved_affinity;
-	}
-
 	/* Disable 'immediate' events last */
 	for (int imm = 0; imm <= 1; imm++) {
-		evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) {
+		evlist__for_each_cpu(evlist_cpu_itr, evlist) {
 			pos = evlist_cpu_itr.evsel;
 			if (evsel__strcmp(pos, evsel_name))
 				continue;
@@ -483,7 +543,6 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl
 			break;
 	}
 
-	affinity__cleanup(affinity);
 	evlist__for_each_entry(evlist, pos) {
 		if (evsel__strcmp(pos, evsel_name))
 			continue;
@@ -523,16 +582,8 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_
 {
 	struct evsel *pos;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity saved_affinity, *affinity = NULL;
-
-	// See explanation in evlist__close()
-	if (!cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
-		if (affinity__setup(&saved_affinity) < 0)
-			return;
-		affinity = &saved_affinity;
-	}
 
-	evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) {
+	evlist__for_each_cpu(evlist_cpu_itr, evlist) {
 		pos = evlist_cpu_itr.evsel;
 		if (evsel__strcmp(pos, evsel_name))
 			continue;
@@ -542,7 +593,6 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_
 			continue;
 		evsel__enable_cpu(pos, evlist_cpu_itr.cpu_map_idx);
 	}
-	affinity__cleanup(affinity);
 	evlist__for_each_entry(evlist, pos) {
 		if (evsel__strcmp(pos, evsel_name))
 			continue;
@@ -1339,28 +1389,14 @@ void evlist__close(struct evlist *evlist)
 {
 	struct evsel *evsel;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity affinity;
-
-	/*
-	 * With perf record core.user_requested_cpus is usually NULL.
-	 * Use the old method to handle this for now.
-	 */
-	if (!evlist->core.user_requested_cpus ||
-	    cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
-		evlist__for_each_entry_reverse(evlist, evsel)
-			evsel__close(evsel);
-		return;
-	}
-
-	if (affinity__setup(&affinity) < 0)
-		return;
 
-	evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) {
+	evlist__for_each_cpu(evlist_cpu_itr, evlist) {
+		if (evlist_cpu_itr.cpu_map_idx == 0 && evsel__is_retire_lat(evlist_cpu_itr.evsel))
+			evsel__tpebs_close(evlist_cpu_itr.evsel);
 		perf_evsel__close_cpu(&evlist_cpu_itr.evsel->core,
 				      evlist_cpu_itr.cpu_map_idx);
 	}
 
-	affinity__cleanup(&affinity);
 	evlist__for_each_entry_reverse(evlist, evsel) {
 		perf_evsel__free_fd(&evsel->core);
 		perf_evsel__free_id(&evsel->core);
@@ -1614,14 +1650,14 @@ int evlist__parse_sample_timestamp(struct evlist *evlist, union perf_event *even
 int evlist__strerror_open(struct evlist *evlist, int err, char *buf, size_t size)
 {
 	int printed, value;
-	char sbuf[STRERR_BUFSIZE], *emsg = str_error_r(err, sbuf, sizeof(sbuf));
 
 	switch (err) {
 	case EACCES:
 	case EPERM:
+		errno = err;
 		printed = scnprintf(buf, size,
-				    "Error:\t%s.\n"
-				    "Hint:\tCheck /proc/sys/kernel/perf_event_paranoid setting.", emsg);
+				    "Error:\t%m.\n"
+				    "Hint:\tCheck /proc/sys/kernel/perf_event_paranoid setting.");
 
 		value = perf_event_paranoid();
 
@@ -1648,16 +1684,18 @@ int evlist__strerror_open(struct evlist *evlist, int err, char *buf, size_t size
 		if (first->core.attr.sample_freq < (u64)max_freq)
 			goto out_default;
 
+		errno = err;
 		printed = scnprintf(buf, size,
-				    "Error:\t%s.\n"
+				    "Error:\t%m.\n"
 				    "Hint:\tCheck /proc/sys/kernel/perf_event_max_sample_rate.\n"
 				    "Hint:\tThe current value is %d and %" PRIu64 " is being requested.",
-				    emsg, max_freq, first->core.attr.sample_freq);
+				    max_freq, first->core.attr.sample_freq);
 		break;
 	}
 	default:
 out_default:
-		scnprintf(buf, size, "%s", emsg);
+		errno = err;
+		scnprintf(buf, size, "%m");
 		break;
 	}
 
@@ -1666,17 +1704,17 @@ out_default:
 
 int evlist__strerror_mmap(struct evlist *evlist, int err, char *buf, size_t size)
 {
-	char sbuf[STRERR_BUFSIZE], *emsg = str_error_r(err, sbuf, sizeof(sbuf));
 	int pages_attempted = evlist->core.mmap_len / 1024, pages_max_per_user, printed = 0;
 
 	switch (err) {
 	case EPERM:
 		sysctl__read_int("kernel/perf_event_mlock_kb", &pages_max_per_user);
+		errno = err;
 		printed += scnprintf(buf + printed, size - printed,
-				     "Error:\t%s.\n"
+				     "Error:\t%m.\n"
 				     "Hint:\tCheck /proc/sys/kernel/perf_event_mlock_kb (%d kB) setting.\n"
 				     "Hint:\tTried using %zd kB.\n",
-				     emsg, pages_max_per_user, pages_attempted);
+				     pages_max_per_user, pages_attempted);
 
 		if (pages_attempted >= pages_max_per_user) {
 			printed += scnprintf(buf + printed, size - printed,
@@ -1688,7 +1726,8 @@ int evlist__strerror_mmap(struct evlist *evlist, int err, char *buf, size_t size
 				     "Hint:\tTry using a smaller -m/--mmap-pages value.");
 		break;
 	default:
-		scnprintf(buf, size, "%s", emsg);
+		errno = err;
+		scnprintf(buf, size, "%m");
 		break;
 	}
 
@@ -1920,8 +1959,8 @@ static int evlist__parse_control_fifo(const char *str, int *ctl_fd, int *ctl_fd_
 	 */
 	fd = open(s, O_RDWR | O_NONBLOCK | O_CLOEXEC);
 	if (fd < 0) {
-		pr_err("Failed to open '%s'\n", s);
 		ret = -errno;
+		pr_err("Failed to open '%s': %m\n", s);
 		goto out_free;
 	}
 	*ctl_fd = fd;
@@ -1931,7 +1970,7 @@ static int evlist__parse_control_fifo(const char *str, int *ctl_fd, int *ctl_fd_
 		/* O_RDWR | O_NONBLOCK means the other end need not be open */
 		fd = open(p, O_RDWR | O_NONBLOCK | O_CLOEXEC);
 		if (fd < 0) {
-			pr_err("Failed to open '%s'\n", p);
+			pr_err("Failed to open '%s': %m\n", p);
 			ret = -errno;
 			goto out_free;
 		}
@@ -1945,7 +1984,8 @@ out_free:
 
 int evlist__parse_control(const char *str, int *ctl_fd, int *ctl_fd_ack, bool *ctl_fd_close)
 {
-	char *comma = NULL, *endptr = NULL;
+	const char *comma = NULL;
+	char *endptr = NULL;
 
 	*ctl_fd_close = false;
 
@@ -2363,7 +2403,7 @@ int evlist__parse_event_enable_time(struct evlist *evlist, struct record_opts *o
 	eet->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
 	if (eet->timerfd == -1) {
 		err = -errno;
-		pr_err("timerfd_create failed: %s\n", strerror(errno));
+		pr_err("timerfd_create failed: %m\n");
 		goto free_eet_times;
 	}
 
@@ -2398,7 +2438,7 @@ static int event_enable_timer__set_timer(struct event_enable_timer *eet, int ms)
 
 	if (timerfd_settime(eet->timerfd, 0, &its, NULL) < 0) {
 		err = -errno;
-		pr_err("timerfd_settime failed: %s\n", strerror(errno));
+		pr_err("timerfd_settime failed: %m\n");
 	}
 	return err;
 }
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 911834ae7c2a..d17c3b57a409 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -10,6 +10,7 @@
 #include <internal/evlist.h>
 #include <internal/evsel.h>
 #include <perf/evlist.h>
+#include "affinity.h"
 #include "events_stats.h"
 #include "evsel.h"
 #include "rblist.h"
@@ -58,6 +59,7 @@ struct event_enable_timer;
 struct evlist {
 	struct perf_evlist core;
 	bool		 enabled;
+	bool		 no_affinity;
 	int		 id_pos;
 	int		 is_pos;
 	int		 nr_br_cntr;
@@ -363,6 +365,8 @@ struct evlist_cpu_iterator {
 	struct perf_cpu cpu;
 	/** If present, used to set the affinity when switching between CPUs. */
 	struct affinity *affinity;
+	/** Maybe be used to hold affinity state prior to iterating. */
+	struct affinity saved_affinity;
 };
 
 /**
@@ -370,22 +374,31 @@ struct evlist_cpu_iterator {
  *                        affinity, iterate over all CPUs and then the evlist
  *                        for each evsel on that CPU. When switching between
  *                        CPUs the affinity is set to the CPU to avoid IPIs
- *                        during syscalls.
+ *                        during syscalls. The affinity is set up and removed
+ *                        automatically, if the loop is broken a call to
+ *                        evlist_cpu_iterator__exit is necessary.
  * @evlist_cpu_itr: the iterator instance.
  * @evlist: evlist instance to iterate.
- * @affinity: NULL or used to set the affinity to the current CPU.
  */
-#define evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity)		\
-	for ((evlist_cpu_itr) = evlist__cpu_begin(evlist, affinity);	\
+#define evlist__for_each_cpu(evlist_cpu_itr, evlist)			\
+	for (evlist_cpu_iterator__init(&(evlist_cpu_itr), evlist);	\
 	     !evlist_cpu_iterator__end(&evlist_cpu_itr);		\
 	     evlist_cpu_iterator__next(&evlist_cpu_itr))
 
-/** Returns an iterator set to the first CPU/evsel of evlist. */
-struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity);
+/** Setup an iterator set to the first CPU/evsel of evlist. */
+void evlist_cpu_iterator__init(struct evlist_cpu_iterator *itr, struct evlist *evlist);
+/**
+ * Cleans up the iterator, automatically done by evlist_cpu_iterator__next when
+ * the end of the list is reached. Multiple calls are safe.
+ */
+void evlist_cpu_iterator__exit(struct evlist_cpu_iterator *itr);
 /** Move to next element in iterator, updating CPU, evsel and the affinity. */
 void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr);
 /** Returns true when iterator is at the end of the CPUs and evlist. */
-bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr);
+static inline bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr)
+{
+	return evlist_cpu_itr->evlist_cpu_map_idx >= evlist_cpu_itr->evlist_cpu_map_nr;
+}
 
 struct evsel *evlist__get_tracking_event(struct evlist *evlist);
 void evlist__set_tracking_event(struct evlist *evlist, struct evsel *tracking_evsel);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 9cd706f62793..f59228c1a39e 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -34,6 +34,7 @@
 #include "callchain.h"
 #include "cgroup.h"
 #include "counts.h"
+#include "dwarf-regs.h"
 #include "event.h"
 #include "evsel.h"
 #include "time-utils.h"
@@ -648,8 +649,9 @@ struct tep_event *evsel__tp_format(struct evsel *evsel)
 	if (IS_ERR(tp_format)) {
 		int err = -PTR_ERR(evsel->tp_format);
 
-		pr_err("Error getting tracepoint format '%s' '%s'(%d)\n",
-			evsel__name(evsel), strerror(err), err);
+		errno = err;
+		pr_err("Error getting tracepoint format '%s': %m\n",
+			evsel__name(evsel));
 		return NULL;
 	}
 	evsel->tp_format = tp_format;
@@ -1006,6 +1008,13 @@ int evsel__group_desc(struct evsel *evsel, char *buf, size_t size)
 	return ret;
 }
 
+uint16_t evsel__e_machine(struct evsel *evsel, uint32_t *e_flags)
+{
+	struct perf_session *session = evsel__session(evsel);
+
+	return perf_session__e_machine(session, e_flags);
+}
+
 static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *opts,
 				      struct callchain_param *param)
 {
@@ -1041,18 +1050,18 @@ static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *o
 
 	if (param->record_mode == CALLCHAIN_DWARF) {
 		if (!function) {
-			const char *arch = perf_env__arch(evsel__env(evsel));
+			uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
 
 			evsel__set_sample_bit(evsel, REGS_USER);
 			evsel__set_sample_bit(evsel, STACK_USER);
 			if (opts->sample_user_regs &&
-			    DWARF_MINIMAL_REGS(arch) != arch__user_reg_mask()) {
-				attr->sample_regs_user |= DWARF_MINIMAL_REGS(arch);
+			    DWARF_MINIMAL_REGS(e_machine) != perf_user_reg_mask(EM_HOST)) {
+				attr->sample_regs_user |= DWARF_MINIMAL_REGS(e_machine);
 				pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, "
 					   "specifying a subset with --user-regs may render DWARF unwinding unreliable, "
 					   "so the minimal registers set (IP, SP) is explicitly forced.\n");
 			} else {
-				attr->sample_regs_user |= arch__user_reg_mask();
+				attr->sample_regs_user |= perf_user_reg_mask(EM_HOST);
 			}
 			attr->sample_stack_user = param->dump_size;
 			attr->exclude_callchain_user = 1;
@@ -1242,7 +1251,11 @@ static void evsel__apply_config_terms(struct evsel *evsel,
 		case EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE:
 			/* Already applied by auxtrace */
 			break;
-		case EVSEL__CONFIG_TERM_CFG_CHG:
+		case EVSEL__CONFIG_TERM_USR_CHG_CONFIG:
+		case EVSEL__CONFIG_TERM_USR_CHG_CONFIG1:
+		case EVSEL__CONFIG_TERM_USR_CHG_CONFIG2:
+		case EVSEL__CONFIG_TERM_USR_CHG_CONFIG3:
+		case EVSEL__CONFIG_TERM_USR_CHG_CONFIG4:
 			break;
 		case EVSEL__CONFIG_TERM_RATIO_TO_PREV:
 			rtp_buf = term->val.str;
@@ -1314,6 +1327,109 @@ struct evsel_config_term *__evsel__get_config_term(struct evsel *evsel, enum evs
 	return found_term;
 }
 
+/*
+ * Set @config_name to @val as long as the user hasn't already set or cleared it
+ * by passing a config term on the command line.
+ *
+ * @val is the value to put into the bits specified by @config_name rather than
+ * the bit pattern. It is shifted into position by this function, so to set
+ * something to true, pass 1 for val rather than a pre shifted value.
+ */
+void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
+				u64 val)
+{
+	u64 user_bits = 0;
+	struct evsel_config_term *term = evsel__get_config_term(evsel,
+								USR_CHG_CONFIG);
+	struct perf_pmu_format *format = pmu_find_format(&evsel->pmu->format,
+							 config_name);
+	int fbit;
+	__u64 *vp;
+
+	if (!format)
+		return;
+
+	switch (format->value) {
+	case PERF_PMU_FORMAT_VALUE_CONFIG:
+		term = evsel__get_config_term(evsel, USR_CHG_CONFIG);
+		vp = &evsel->core.attr.config;
+		break;
+	case PERF_PMU_FORMAT_VALUE_CONFIG1:
+		term = evsel__get_config_term(evsel, USR_CHG_CONFIG1);
+		vp = &evsel->core.attr.config1;
+		break;
+	case PERF_PMU_FORMAT_VALUE_CONFIG2:
+		term = evsel__get_config_term(evsel, USR_CHG_CONFIG2);
+		vp = &evsel->core.attr.config2;
+		break;
+	case PERF_PMU_FORMAT_VALUE_CONFIG3:
+		term = evsel__get_config_term(evsel, USR_CHG_CONFIG3);
+		vp = &evsel->core.attr.config3;
+		break;
+	case PERF_PMU_FORMAT_VALUE_CONFIG4:
+		term = evsel__get_config_term(evsel, USR_CHG_CONFIG4);
+		vp = &evsel->core.attr.config4;
+		break;
+	default:
+		pr_err("Unknown format value: %d\n", format->value);
+		return;
+	}
+
+	if (!format)
+		return;
+
+	if (term)
+		user_bits = term->val.cfg_chg;
+
+	/* Do nothing if the user changed the value */
+	for_each_set_bit(fbit, format->bits, PERF_PMU_FORMAT_BITS)
+		if ((1ULL << fbit) & user_bits)
+			return;
+
+	/* Otherwise replace it */
+	perf_pmu__format_pack(format->bits, val, vp, /*zero=*/true);
+}
+
+
+int evsel__get_config_val(const struct evsel *evsel, const char *config_name,
+			  u64 *val)
+{
+	struct perf_pmu_format *format = pmu_find_format(&evsel->pmu->format, config_name);
+
+	if (!format || bitmap_empty(format->bits, PERF_PMU_FORMAT_BITS)) {
+		pr_err("Unknown/empty format name: %s\n", config_name);
+		*val = 0;
+		return -EINVAL;
+	}
+
+	switch (format->value) {
+	case PERF_PMU_FORMAT_VALUE_CONFIG:
+		*val = perf_pmu__format_unpack(format->bits,
+					       evsel->core.attr.config);
+		return 0;
+	case PERF_PMU_FORMAT_VALUE_CONFIG1:
+		*val = perf_pmu__format_unpack(format->bits,
+					       evsel->core.attr.config1);
+		return 0;
+	case PERF_PMU_FORMAT_VALUE_CONFIG2:
+		*val = perf_pmu__format_unpack(format->bits,
+					       evsel->core.attr.config2);
+		return 0;
+	case PERF_PMU_FORMAT_VALUE_CONFIG3:
+		*val = perf_pmu__format_unpack(format->bits,
+					       evsel->core.attr.config3);
+		return 0;
+	case PERF_PMU_FORMAT_VALUE_CONFIG4:
+		*val = perf_pmu__format_unpack(format->bits,
+					       evsel->core.attr.config4);
+		return 0;
+	default:
+		pr_err("Unknown format value: %d\n", format->value);
+		*val = 0;
+		return -EINVAL;
+	}
+}
+
 void __weak arch_evsel__set_sample_weight(struct evsel *evsel)
 {
 	evsel__set_sample_bit(evsel, WEIGHT);
@@ -1445,10 +1561,11 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts,
 		attr->inherit_stat = 1;
 	}
 
-	if (opts->sample_address) {
+	if (opts->sample_address)
 		evsel__set_sample_bit(evsel, ADDR);
+
+	if (opts->record_data_mmap)
 		attr->mmap_data = track;
-	}
 
 	/*
 	 * We don't allow user space callchains for  function trace
@@ -2771,8 +2888,8 @@ retry_open:
 					    PERF_EVENT_IOC_SET_BPF,
 					    bpf_fd);
 				if (err && errno != EEXIST) {
-					pr_err("failed to attach bpf fd %d: %s\n",
-					       bpf_fd, strerror(errno));
+					pr_err("failed to attach bpf fd %d: %m\n",
+					       bpf_fd);
 					err = -EINVAL;
 					goto out_close;
 				}
@@ -3863,7 +3980,6 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target,
 			 int err, char *msg, size_t size)
 {
 	struct perf_pmu *pmu;
-	char sbuf[STRERR_BUFSIZE];
 	int printed = 0, enforced = 0;
 	int ret;
 
@@ -3996,10 +4112,11 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target,
 	if (ret)
 		return ret;
 
+	errno = err;
 	return scnprintf(msg, size,
-	"The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n"
-	"\"dmesg | grep -i perf\" may provide additional information.\n",
-			 err, str_error_r(err, sbuf, sizeof(sbuf)), evsel__name(evsel));
+			 "The sys_perf_event_open() syscall failed for event (%s): %m\n"
+			 "\"dmesg | grep -i perf\" may provide additional information.\n",
+			 evsel__name(evsel));
 }
 
 struct perf_session *evsel__session(struct evsel *evsel)
@@ -4097,6 +4214,17 @@ void evsel__set_leader(struct evsel *evsel, struct evsel *leader)
 	evsel->core.leader = &leader->core;
 }
 
+bool evsel__is_aux_event(const struct evsel *evsel)
+{
+	struct perf_pmu *pmu;
+
+	if (evsel->needs_auxtrace_mmap)
+		return true;
+
+	pmu = evsel__find_pmu(evsel);
+	return pmu && pmu->auxtrace;
+}
+
 int evsel__source_count(const struct evsel *evsel)
 {
 	struct evsel *pos;
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index a08130ff2e47..a3d754c029a0 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -546,6 +546,7 @@ static inline bool evsel__is_dummy_event(struct evsel *evsel)
 
 struct perf_session *evsel__session(struct evsel *evsel);
 struct perf_env *evsel__env(struct evsel *evsel);
+uint16_t evsel__e_machine(struct evsel *evsel, uint32_t *e_flags);
 
 int evsel__store_ids(struct evsel *evsel, struct evlist *evlist);
 
@@ -575,8 +576,10 @@ void evsel__uniquify_counter(struct evsel *counter);
 	((((src) >> (pos)) & ((1ull << (size)) - 1)) << (63 - ((pos) + (size) - 1)))
 
 u64 evsel__bitfield_swap_branch_flags(u64 value);
-void evsel__set_config_if_unset(struct perf_pmu *pmu, struct evsel *evsel,
-				const char *config_name, u64 val);
+int evsel__get_config_val(const struct evsel *evsel, const char *config_name,
+			  u64 *val);
+void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
+				u64 val);
 
 bool evsel__is_offcpu_event(struct evsel *evsel);
 
diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h
index bcd3a978f0c4..7b565d76c0bc 100644
--- a/tools/perf/util/evsel_config.h
+++ b/tools/perf/util/evsel_config.h
@@ -27,7 +27,11 @@ enum evsel_term_type {
 	EVSEL__CONFIG_TERM_AUX_OUTPUT,
 	EVSEL__CONFIG_TERM_AUX_ACTION,
 	EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE,
-	EVSEL__CONFIG_TERM_CFG_CHG,
+	EVSEL__CONFIG_TERM_USR_CHG_CONFIG,
+	EVSEL__CONFIG_TERM_USR_CHG_CONFIG1,
+	EVSEL__CONFIG_TERM_USR_CHG_CONFIG2,
+	EVSEL__CONFIG_TERM_USR_CHG_CONFIG3,
+	EVSEL__CONFIG_TERM_USR_CHG_CONFIG4,
 	EVSEL__CONFIG_TERM_RATIO_TO_PREV,
 };
 
@@ -50,6 +54,7 @@ struct evsel_config_term {
 		u64	      cfg_chg;
 		char	      *str;
 		int	      cpu;
+		u64	      val;
 	} val;
 	bool weak;
 };
diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c
index 10f1a03c2860..5521d00bff2c 100644
--- a/tools/perf/util/evsel_fprintf.c
+++ b/tools/perf/util/evsel_fprintf.c
@@ -185,8 +185,12 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment,
 			if (print_dso && (!sym || !sym->inlined))
 				printed += map__fprintf_dsoname_dsoff(map, print_dsoff, addr, fp);
 
-			if (print_srcline)
-				printed += map__fprintf_srcline(map, addr, "\n  ", fp);
+			if (print_srcline) {
+				if (node->srcline)
+					printed += fprintf(fp, "\n  %s", node->srcline);
+				else
+					printed += map__fprintf_srcline(map, addr, "\n  ", fp);
+			}
 
 			if (sym && sym->inlined)
 				printed += fprintf(fp, " (inlined)");
diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c
index a1cd5196f4ec..14882def9704 100644
--- a/tools/perf/util/genelf.c
+++ b/tools/perf/util/genelf.c
@@ -18,8 +18,8 @@
 #include <dwarf.h>
 #endif
 
+#include "blake2s.h"
 #include "genelf.h"
-#include "sha1.h"
 #include "../util/jitdump.h"
 #include <linux/compiler.h>
 
@@ -51,7 +51,7 @@ static char shd_string_table[] = {
 static struct buildid_note {
 	Elf_Note desc;		/* descsz: size of build-id, must be multiple of 4 */
 	char	 name[4];	/* GNU\0 */
-	u8	 build_id[SHA1_DIGEST_SIZE];
+	u8	 build_id[20];
 } bnote;
 
 static Elf_Sym symtab[]={
@@ -152,9 +152,28 @@ jit_add_eh_frame_info(Elf *e, void* unwinding, uint64_t unwinding_header_size,
 	return 0;
 }
 
+enum {
+	TAG_CODE = 0,
+	TAG_SYMTAB = 1,
+	TAG_STRSYM = 2,
+};
+
+/*
+ * Update the hash using the given data, also prepending a (tag, len) prefix to
+ * ensure that distinct input tuples reliably result in distinct hashes.
+ */
+static void blake2s_update_tagged(struct blake2s_ctx *ctx, int tag,
+				  const void *data, size_t len)
+{
+	u64 prefix = ((u64)tag << 56) | len;
+
+	blake2s_update(ctx, (const u8 *)&prefix, sizeof(prefix));
+	blake2s_update(ctx, data, len);
+}
+
 /*
  * fd: file descriptor open for writing for the output file
- * load_addr: code load address (could be zero, just used for buildid)
+ * load_addr: code load address (could be zero)
  * sym: function name (for native code - used as the symbol)
  * code: the native code
  * csize: the code size in bytes
@@ -173,8 +192,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym,
 	Elf_Shdr *shdr;
 	uint64_t eh_frame_base_offset;
 	char *strsym = NULL;
-	void *build_id_data = NULL, *tmp;
-	int build_id_data_len;
+	struct blake2s_ctx ctx;
 	int symlen;
 	int retval = -1;
 
@@ -253,13 +271,8 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym,
 	shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
 	shdr->sh_entsize = 0;
 
-	build_id_data = malloc(csize);
-	if (build_id_data == NULL) {
-		warnx("cannot allocate build-id data");
-		goto error;
-	}
-	memcpy(build_id_data, code, csize);
-	build_id_data_len = csize;
+	blake2s_init(&ctx, sizeof(bnote.build_id));
+	blake2s_update_tagged(&ctx, TAG_CODE, code, csize);
 
 	/*
 	 * Setup .eh_frame_hdr and .eh_frame
@@ -344,14 +357,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym,
 	shdr->sh_entsize = sizeof(Elf_Sym);
 	shdr->sh_link = unwinding ? 6 : 4; /* index of .strtab section */
 
-	tmp = realloc(build_id_data, build_id_data_len + sizeof(symtab));
-	if (tmp == NULL) {
-		warnx("cannot allocate build-id data");
-		goto error;
-	}
-	memcpy(tmp + build_id_data_len, symtab, sizeof(symtab));
-	build_id_data = tmp;
-	build_id_data_len += sizeof(symtab);
+	blake2s_update_tagged(&ctx, TAG_SYMTAB, symtab, sizeof(symtab));
 
 	/*
 	 * setup symbols string table
@@ -395,14 +401,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym,
 	shdr->sh_flags = 0;
 	shdr->sh_entsize = 0;
 
-	tmp = realloc(build_id_data, build_id_data_len + symlen);
-	if (tmp == NULL) {
-		warnx("cannot allocate build-id data");
-		goto error;
-	}
-	memcpy(tmp + build_id_data_len, strsym, symlen);
-	build_id_data = tmp;
-	build_id_data_len += symlen;
+	blake2s_update_tagged(&ctx, TAG_STRSYM, strsym, symlen);
 
 	/*
 	 * setup build-id section
@@ -422,7 +421,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym,
 	/*
 	 * build-id generation
 	 */
-	sha1(build_id_data, build_id_data_len, bnote.build_id);
+	blake2s_final(&ctx, bnote.build_id);
 	bnote.desc.namesz = sizeof(bnote.name); /* must include 0 termination */
 	bnote.desc.descsz = sizeof(bnote.build_id);
 	bnote.desc.type   = NT_GNU_BUILD_ID;
@@ -467,7 +466,6 @@ error:
 	(void)elf_end(e);
 
 	free(strsym);
-	free(build_id_data);
 
 	return retval;
 }
diff --git a/tools/perf/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh
deleted file mode 100755
index 6a73c903d690..000000000000
--- a/tools/perf/util/generate-cmdlist.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-echo "/* Automatically generated by $0 */
-struct cmdname_help
-{
-    char name[16];
-    char help[80];
-};
-
-static struct cmdname_help common_cmds[] = {"
-
-sed -n -e 's/^perf-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
-sort |
-while read cmd
-do
-     sed -n '
-     /^NAME/,/perf-'"$cmd"'/H
-     ${
-            x
-            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
-	    p
-     }' "Documentation/perf-$cmd.txt"
-done
-
-echo "#ifdef HAVE_LIBELF_SUPPORT"
-sed -n -e 's/^perf-\([^ 	]*\)[ 	].* full.*/\1/p' command-list.txt |
-sort |
-while read cmd
-do
-     sed -n '
-     /^NAME/,/perf-'"$cmd"'/H
-     ${
-            x
-            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
-	    p
-     }' "Documentation/perf-$cmd.txt"
-done
-echo "#endif /* HAVE_LIBELF_SUPPORT */"
-
-echo "#if defined(HAVE_LIBTRACEEVENT)"
-sed -n -e 's/^perf-\([^ 	]*\)[ 	].* audit*/\1/p' command-list.txt |
-sort |
-while read cmd
-do
-     sed -n '
-     /^NAME/,/perf-'"$cmd"'/H
-     ${
-            x
-            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
-	    p
-     }' "Documentation/perf-$cmd.txt"
-done
-echo "#endif /* HAVE_LIBTRACEEVENT */"
-
-echo "#ifdef HAVE_LIBTRACEEVENT"
-sed -n -e 's/^perf-\([^ 	]*\)[ 	].* traceevent.*/\1/p' command-list.txt |
-sort |
-while read cmd
-do
-     sed -n '
-     /^NAME/,/perf-'"$cmd"'/H
-     ${
-            x
-            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
-            p
-     }' "Documentation/perf-$cmd.txt"
-done
-echo "#endif /* HAVE_LIBTRACEEVENT */"
-echo "};"
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index f5cad377c99e..9142a8ba4019 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -76,6 +76,7 @@ static const u64 __perf_magic2    = 0x32454c4946524550ULL;
 static const u64 __perf_magic2_sw = 0x50455246494c4532ULL;
 
 #define PERF_MAGIC	__perf_magic2
+#define DNAME_LEN	16
 
 const char perf_version_string[] = PERF_VERSION;
 
@@ -378,6 +379,21 @@ static int write_arch(struct feat_fd *ff,
 	return do_write_string(ff, uts.machine);
 }
 
+static int write_e_machine(struct feat_fd *ff,
+			   struct evlist *evlist __maybe_unused)
+{
+	/* e_machine expanded from 16 to 32-bits for alignment. */
+	uint32_t e_flags;
+	uint32_t e_machine = perf_session__e_machine(evlist->session, &e_flags);
+	int ret;
+
+	ret = do_write(ff, &e_machine, sizeof(e_machine));
+	if (ret)
+		return ret;
+
+	return do_write(ff, &e_flags, sizeof(e_flags));
+}
+
 static int write_version(struct feat_fd *ff,
 			 struct evlist *evlist __maybe_unused)
 {
@@ -1614,6 +1630,161 @@ static int write_pmu_caps(struct feat_fd *ff,
 	return 0;
 }
 
+struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains, u32 nr)
+{
+	char dname[DNAME_LEN], cpumask[MAX_NR_CPUS];
+	struct domain_info *domain_info;
+	struct cpu_domain_map **cd_map;
+	char cpulist[MAX_NR_CPUS];
+	char *line = NULL;
+	u32 cpu, domain;
+	u32 dcount = 0;
+	size_t len;
+	FILE *fp;
+
+	fp = fopen("/proc/schedstat", "r");
+	if (!fp) {
+		pr_err("Failed to open /proc/schedstat\n");
+		return NULL;
+	}
+
+	cd_map = zalloc(sizeof(*cd_map) * nr);
+	if (!cd_map)
+		goto out;
+
+	while (getline(&line, &len, fp) > 0) {
+		int retval;
+
+		if (strncmp(line, "version", 7) == 0) {
+			retval = sscanf(line, "version %d\n", schedstat_version);
+			if (retval != 1)
+				continue;
+
+		} else if (strncmp(line, "cpu", 3) == 0) {
+			retval = sscanf(line, "cpu%u %*s", &cpu);
+			if (retval == 1) {
+				cd_map[cpu] = zalloc(sizeof(*cd_map[cpu]));
+				if (!cd_map[cpu])
+					goto out_free_line;
+				cd_map[cpu]->cpu = cpu;
+			} else
+				continue;
+
+			dcount = 0;
+		} else if (strncmp(line, "domain", 6) == 0) {
+			struct domain_info **temp_domains;
+
+			dcount++;
+			temp_domains = realloc(cd_map[cpu]->domains, dcount * sizeof(domain_info));
+			if (!temp_domains)
+				goto out_free_line;
+			else
+				cd_map[cpu]->domains = temp_domains;
+
+			domain_info = zalloc(sizeof(*domain_info));
+			if (!domain_info)
+				goto out_free_line;
+
+			cd_map[cpu]->domains[dcount - 1] = domain_info;
+
+			if (*schedstat_version >= 17) {
+				retval = sscanf(line, "domain%u %s %s %*s", &domain, dname,
+						cpumask);
+				if (retval != 3)
+					continue;
+
+				domain_info->dname = strdup(dname);
+				if (!domain_info->dname)
+					goto out_free_line;
+			} else {
+				retval = sscanf(line, "domain%u %s %*s", &domain, cpumask);
+				if (retval != 2)
+					continue;
+			}
+
+			domain_info->domain = domain;
+			if (domain > *max_sched_domains)
+				*max_sched_domains = domain;
+
+			domain_info->cpumask = strdup(cpumask);
+			if (!domain_info->cpumask)
+				goto out_free_line;
+
+			cpumask_to_cpulist(cpumask, cpulist);
+			domain_info->cpulist = strdup(cpulist);
+			if (!domain_info->cpulist)
+				goto out_free_line;
+
+			cd_map[cpu]->nr_domains = dcount;
+		}
+	}
+
+out_free_line:
+	free(line);
+out:
+	fclose(fp);
+	return cd_map;
+}
+
+static int write_cpu_domain_info(struct feat_fd *ff,
+				 struct evlist *evlist __maybe_unused)
+{
+	u32 max_sched_domains = 0, schedstat_version = 0;
+	struct cpu_domain_map **cd_map;
+	u32 i, j, nr, ret;
+
+	nr = cpu__max_present_cpu().cpu;
+
+	cd_map = build_cpu_domain_map(&schedstat_version, &max_sched_domains, nr);
+	if (!cd_map)
+		return -1;
+
+	ret = do_write(ff, &schedstat_version, sizeof(u32));
+	if (ret < 0)
+		goto out;
+
+	max_sched_domains += 1;
+	ret = do_write(ff, &max_sched_domains, sizeof(u32));
+	if (ret < 0)
+		goto out;
+
+	for (i = 0; i < nr; i++) {
+		if (!cd_map[i])
+			continue;
+
+		ret = do_write(ff, &cd_map[i]->cpu, sizeof(u32));
+		if (ret < 0)
+			goto out;
+
+		ret = do_write(ff, &cd_map[i]->nr_domains, sizeof(u32));
+		if (ret < 0)
+			goto out;
+
+		for (j = 0; j < cd_map[i]->nr_domains; j++) {
+			ret = do_write(ff, &cd_map[i]->domains[j]->domain, sizeof(u32));
+			if (ret < 0)
+				goto out;
+			if (schedstat_version >= 17) {
+				ret = do_write_string(ff, cd_map[i]->domains[j]->dname);
+				if (ret < 0)
+					goto out;
+			}
+
+			ret = do_write_string(ff, cd_map[i]->domains[j]->cpumask);
+			if (ret < 0)
+				goto out;
+
+			ret = do_write_string(ff, cd_map[i]->domains[j]->cpulist);
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+out:
+	free_cpu_domain_info(cd_map, schedstat_version, nr);
+	return ret;
+}
+
 static void print_hostname(struct feat_fd *ff, FILE *fp)
 {
 	fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname);
@@ -1629,6 +1800,12 @@ static void print_arch(struct feat_fd *ff, FILE *fp)
 	fprintf(fp, "# arch : %s\n", ff->ph->env.arch);
 }
 
+static void print_e_machine(struct feat_fd *ff, FILE *fp)
+{
+	fprintf(fp, "# e_machine : %u\n", ff->ph->env.e_machine);
+	fprintf(fp, "#   e_flags : %u\n", ff->ph->env.e_flags);
+}
+
 static void print_cpudesc(struct feat_fd *ff, FILE *fp)
 {
 	fprintf(fp, "# cpudesc : %s\n", ff->ph->env.cpu_desc);
@@ -2247,6 +2424,39 @@ static void print_mem_topology(struct feat_fd *ff, FILE *fp)
 	}
 }
 
+static void print_cpu_domain_info(struct feat_fd *ff, FILE *fp)
+{
+	struct cpu_domain_map **cd_map = ff->ph->env.cpu_domain;
+	u32 nr = ff->ph->env.nr_cpus_avail;
+	struct domain_info *d_info;
+	u32 i, j;
+
+	fprintf(fp, "# schedstat version	: %u\n", ff->ph->env.schedstat_version);
+	fprintf(fp, "# Maximum sched domains	: %u\n", ff->ph->env.max_sched_domains);
+
+	for (i = 0; i < nr; i++) {
+		if (!cd_map[i])
+			continue;
+
+		fprintf(fp, "# cpu		: %u\n", cd_map[i]->cpu);
+		fprintf(fp, "# nr_domains	: %u\n", cd_map[i]->nr_domains);
+
+		for (j = 0; j < cd_map[i]->nr_domains; j++) {
+			d_info = cd_map[i]->domains[j];
+			if (!d_info)
+				continue;
+
+			fprintf(fp, "# Domain		: %u\n", d_info->domain);
+
+			if (ff->ph->env.schedstat_version >= 17)
+				fprintf(fp, "# Domain name      : %s\n", d_info->dname);
+
+			fprintf(fp, "# Domain cpu map   : %s\n", d_info->cpumask);
+			fprintf(fp, "# Domain cpu list  : %s\n", d_info->cpulist);
+		}
+	}
+}
+
 static int __event_process_build_id(struct perf_record_header_build_id *bev,
 				    char *filename,
 				    struct perf_session *session)
@@ -2423,6 +2633,17 @@ FEAT_PROCESS_STR_FUN(arch, arch);
 FEAT_PROCESS_STR_FUN(cpudesc, cpu_desc);
 FEAT_PROCESS_STR_FUN(cpuid, cpuid);
 
+static int process_e_machine(struct feat_fd *ff, void *data __maybe_unused)
+{
+	int ret;
+
+	ret = do_read_u32(ff, &ff->ph->env.e_machine);
+	if (ret)
+		return ret;
+
+	return do_read_u32(ff, &ff->ph->env.e_flags);
+}
+
 #ifdef HAVE_LIBTRACEEVENT
 static int process_tracing_data(struct feat_fd *ff, void *data)
 {
@@ -3388,6 +3609,93 @@ err:
 	return ret;
 }
 
+static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused)
+{
+	u32 schedstat_version, max_sched_domains, cpu, domain, nr_domains;
+	struct perf_env *env = &ff->ph->env;
+	char *dname, *cpumask, *cpulist;
+	struct cpu_domain_map **cd_map;
+	struct domain_info *d_info;
+	u32 nra, nr, i, j;
+	int ret;
+
+	nra = env->nr_cpus_avail;
+	nr = env->nr_cpus_online;
+
+	cd_map = zalloc(sizeof(*cd_map) * nra);
+	if (!cd_map)
+		return -1;
+
+	env->cpu_domain = cd_map;
+
+	ret = do_read_u32(ff, &schedstat_version);
+	if (ret)
+		return ret;
+
+	env->schedstat_version = schedstat_version;
+
+	ret = do_read_u32(ff, &max_sched_domains);
+	if (ret)
+		return ret;
+
+	env->max_sched_domains = max_sched_domains;
+
+	for (i = 0; i < nr; i++) {
+		if (do_read_u32(ff, &cpu))
+			return -1;
+
+		cd_map[cpu] = zalloc(sizeof(*cd_map[cpu]));
+		if (!cd_map[cpu])
+			return -1;
+
+		cd_map[cpu]->cpu = cpu;
+
+		if (do_read_u32(ff, &nr_domains))
+			return -1;
+
+		cd_map[cpu]->nr_domains = nr_domains;
+
+		cd_map[cpu]->domains = zalloc(sizeof(*d_info) * max_sched_domains);
+		if (!cd_map[cpu]->domains)
+			return -1;
+
+		for (j = 0; j < nr_domains; j++) {
+			if (do_read_u32(ff, &domain))
+				return -1;
+
+			d_info = zalloc(sizeof(*d_info));
+			if (!d_info)
+				return -1;
+
+			assert(cd_map[cpu]->domains[domain] == NULL);
+			cd_map[cpu]->domains[domain] = d_info;
+			d_info->domain = domain;
+
+			if (schedstat_version >= 17) {
+				dname = do_read_string(ff);
+				if (!dname)
+					return -1;
+
+				d_info->dname = dname;
+			}
+
+			cpumask = do_read_string(ff);
+			if (!cpumask)
+				return -1;
+
+			d_info->cpumask = cpumask;
+
+			cpulist = do_read_string(ff);
+			if (!cpulist)
+				return -1;
+
+			d_info->cpulist = cpulist;
+		}
+	}
+
+	return ret;
+}
+
 #define FEAT_OPR(n, func, __full_only) \
 	[HEADER_##n] = {					\
 		.name	    = __stringify(n),			\
@@ -3453,6 +3761,8 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPR(CLOCK_DATA,	clock_data,	false),
 	FEAT_OPN(HYBRID_TOPOLOGY,	hybrid_topology,	true),
 	FEAT_OPR(PMU_CAPS,	pmu_caps,	false),
+	FEAT_OPR(CPU_DOMAIN_INFO,	cpu_domain_info,	true),
+	FEAT_OPR(E_MACHINE,	e_machine,	false),
 };
 
 struct header_print_data {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index c058021c3150..cc40ac796f52 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -53,6 +53,8 @@ enum {
 	HEADER_CLOCK_DATA,
 	HEADER_HYBRID_TOPOLOGY,
 	HEADER_PMU_CAPS,
+	HEADER_CPU_DOMAIN_INFO,
+	HEADER_E_MACHINE,
 	HEADER_LAST_FEATURE,
 	HEADER_FEAT_BITS	= 256,
 };
@@ -210,4 +212,7 @@ char *get_cpuid_str(struct perf_cpu cpu);
 char *get_cpuid_allow_env_override(struct perf_cpu cpu);
 
 int strcmp_cpuid_str(const char *s1, const char *s2);
+
+struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains,
+					     u32 nr);
 #endif /* __PERF_HEADER_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index ef4b569f7df4..7ffaa3d9851b 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -251,7 +251,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 
 	if (h->cgroup) {
 		const char *cgrp_name = "unknown";
-		struct cgroup *cgrp = cgroup__find(maps__machine(h->ms.maps)->env,
+		struct cgroup *cgrp = cgroup__find(maps__machine(thread__maps(h->ms.thread))->env,
 						   h->cgroup);
 		if (cgrp != NULL)
 			cgrp_name = cgrp->name;
@@ -536,7 +536,7 @@ static int hist_entry__init(struct hist_entry *he,
 			memset(&he->stat, 0, sizeof(he->stat));
 	}
 
-	he->ms.maps = maps__get(he->ms.maps);
+	he->ms.thread = thread__get(he->ms.thread);
 	he->ms.map = map__get(he->ms.map);
 
 	if (he->branch_info) {
@@ -552,9 +552,9 @@ static int hist_entry__init(struct hist_entry *he,
 		memcpy(he->branch_info, template->branch_info,
 		       sizeof(*he->branch_info));
 
-		he->branch_info->from.ms.maps = maps__get(he->branch_info->from.ms.maps);
+		he->branch_info->from.ms.thread = thread__get(he->branch_info->from.ms.thread);
 		he->branch_info->from.ms.map = map__get(he->branch_info->from.ms.map);
-		he->branch_info->to.ms.maps = maps__get(he->branch_info->to.ms.maps);
+		he->branch_info->to.ms.thread = thread__get(he->branch_info->to.ms.thread);
 		he->branch_info->to.ms.map = map__get(he->branch_info->to.ms.map);
 	}
 
@@ -810,7 +810,7 @@ __hists__add_entry(struct hists *hists,
 		},
 		.cgroup = sample->cgroup,
 		.ms = {
-			.maps	= al->maps,
+			.thread	= al->thread,
 			.map	= al->map,
 			.sym	= al->sym,
 		},
@@ -890,7 +890,7 @@ struct hist_entry *hists__add_entry_block(struct hists *hists,
 		.block_info = block_info,
 		.hists = hists,
 		.ms = {
-			.maps = al->maps,
+			.thread = al->thread,
 			.map = al->map,
 			.sym = al->sym,
 		},
@@ -1020,8 +1020,8 @@ iter_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *al)
 	if (iter->curr >= iter->total)
 		return 0;
 
-	maps__put(al->maps);
-	al->maps = maps__get(bi[i].to.ms.maps);
+	thread__put(al->thread);
+	al->thread = thread__get(bi[i].to.ms.thread);
 	map__put(al->map);
 	al->map = map__get(bi[i].to.ms.map);
 	al->sym = bi[i].to.ms.sym;
@@ -1232,7 +1232,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 		.comm = thread__comm(al->thread),
 		.ip = al->addr,
 		.ms = {
-			.maps = al->maps,
+			.thread = al->thread,
 			.map = al->map,
 			.sym = al->sym,
 		},
diff --git a/tools/perf/util/hwmon_pmu.c b/tools/perf/util/hwmon_pmu.c
index 279d6b1a47f0..fb3ffa8d32ad 100644
--- a/tools/perf/util/hwmon_pmu.c
+++ b/tools/perf/util/hwmon_pmu.c
@@ -161,7 +161,7 @@ bool parse_hwmon_filename(const char *filename,
 			  bool *alarm)
 {
 	char fn_type[24];
-	const char **elem;
+	const char * const *elem;
 	const char *fn_item = NULL;
 	size_t fn_item_len;
 
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index 6f1b9f6b2466..46a764cf322f 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -89,8 +89,6 @@
 #define DWARF_REG_FB  0xd3affb /* random number */
 
 #ifdef HAVE_LIBDW_SUPPORT
-const char *get_csky_regstr(unsigned int n, unsigned int flags);
-
 /**
  * get_dwarf_regstr() - Returns ftrace register string from DWARF regnum.
  * @n: DWARF register number.
@@ -99,11 +97,23 @@ const char *get_csky_regstr(unsigned int n, unsigned int flags);
  */
 const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int flags);
 
-int get_x86_regnum(const char *name);
+const char *__get_csky_regstr(unsigned int n, unsigned int flags);
+int __get_csky_regnum(const char *name, unsigned int flags);
 
-#if !defined(__x86_64__) && !defined(__i386__)
-int get_arch_regnum(const char *name);
-#endif
+int __get_dwarf_regnum_i386(const char *name);
+int __get_dwarf_regnum_x86_64(const char *name);
+int __get_dwarf_regnum_for_perf_regnum_i386(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum);
+
+int __get_dwarf_regnum_for_perf_regnum_arm(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum);
+
+int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags);
+int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_riscv(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_s390(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_mips(int perf_regnum);
 
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
@@ -112,6 +122,12 @@ int get_arch_regnum(const char *name);
  */
 int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags);
 
+/*
+ * get_dwarf_regnum - Returns DWARF regnum from perf register number.
+ */
+int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, unsigned int flags,
+				     bool only_libdw_supported);
+
 void get_powerpc_regs(u32 raw_insn, int is_source, struct annotated_op_loc *op_loc);
 
 #else /* HAVE_LIBDW_SUPPORT */
diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
index f00814e37de9..e0ce8b904729 100644
--- a/tools/perf/util/jitdump.c
+++ b/tools/perf/util/jitdump.c
@@ -90,7 +90,8 @@ jit_emit_elf(struct jit_buf_desc *jd,
 	saved_errno = errno;
 	nsinfo__mountns_exit(&nsc);
 	if (fd == -1) {
-		pr_warning("cannot create jit ELF %s: %s\n", filename, strerror(saved_errno));
+		errno = saved_errno;
+		pr_warning("cannot create jit ELF %s: %m\n", filename);
 		return -1;
 	}
 
@@ -757,7 +758,7 @@ jit_inject(struct jit_buf_desc *jd, const char *path)
 static int
 jit_detect(const char *mmap_name, pid_t pid, struct nsinfo *nsi, bool *in_pidns)
  {
-	char *p;
+	const char *p;
 	char *end = NULL;
 	pid_t pid2;
 
diff --git a/tools/perf/util/kvm-stat-arch/Build b/tools/perf/util/kvm-stat-arch/Build
new file mode 100644
index 000000000000..d84e55656e7a
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/Build
@@ -0,0 +1,6 @@
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-arm64.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-loongarch.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-powerpc.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-riscv.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-s390.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-x86.o
diff --git a/tools/perf/arch/arm64/util/arm64_exception_types.h b/tools/perf/util/kvm-stat-arch/arm64_exception_types.h
index bf827f19ace0..bf827f19ace0 100644
--- a/tools/perf/arch/arm64/util/arm64_exception_types.h
+++ b/tools/perf/util/kvm-stat-arch/arm64_exception_types.h
diff --git a/tools/perf/arch/powerpc/util/book3s_hcalls.h b/tools/perf/util/kvm-stat-arch/book3s_hcalls.h
index 488f4339b83c..488f4339b83c 100644
--- a/tools/perf/arch/powerpc/util/book3s_hcalls.h
+++ b/tools/perf/util/kvm-stat-arch/book3s_hcalls.h
diff --git a/tools/perf/arch/powerpc/util/book3s_hv_exits.h b/tools/perf/util/kvm-stat-arch/book3s_hv_exits.h
index 2011376c7ab5..2011376c7ab5 100644
--- a/tools/perf/arch/powerpc/util/book3s_hv_exits.h
+++ b/tools/perf/util/kvm-stat-arch/book3s_hv_exits.h
diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c
index 6611aa21cba9..c640dcd8af7c 100644
--- a/tools/perf/arch/arm64/util/kvm-stat.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c
@@ -1,21 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
 #include <memory.h>
-#include "../../../util/evsel.h"
-#include "../../../util/kvm-stat.h"
+#include "../debug.h"
+#include "../evsel.h"
+#include "../kvm-stat.h"
 #include "arm64_exception_types.h"
-#include "debug.h"
 
 define_exit_reasons_table(arm64_exit_reasons, kvm_arm_exception_type);
 define_exit_reasons_table(arm64_trap_exit_reasons, kvm_arm_exception_class);
 
-const char *kvm_trap_exit_reason = "esr_ec";
-const char *vcpu_id_str = "id";
-const char *kvm_exit_reason = "ret";
-const char *kvm_entry_trace = "kvm:kvm_entry";
-const char *kvm_exit_trace = "kvm:kvm_exit";
+static const char *kvm_trap_exit_reason = "esr_ec";
 
-const char *kvm_events_tp[] = {
+static const char * const __kvm_events_tp[] = {
 	"kvm:kvm_entry",
 	"kvm:kvm_exit",
 	NULL,
@@ -26,7 +22,7 @@ static void event_get_key(struct evsel *evsel,
 			  struct event_key *key)
 {
 	key->info = 0;
-	key->key = evsel__intval(evsel, sample, kvm_exit_reason);
+	key->key = evsel__intval(evsel, sample, kvm_exit_reason(EM_AARCH64));
 	key->exit_reasons = arm64_exit_reasons;
 
 	/*
@@ -44,28 +40,28 @@ static bool event_begin(struct evsel *evsel,
 			struct perf_sample *sample __maybe_unused,
 			struct event_key *key __maybe_unused)
 {
-	return evsel__name_is(evsel, kvm_entry_trace);
+	return evsel__name_is(evsel, kvm_entry_trace(EM_AARCH64));
 }
 
 static bool event_end(struct evsel *evsel,
 		      struct perf_sample *sample,
 		      struct event_key *key)
 {
-	if (evsel__name_is(evsel, kvm_exit_trace)) {
+	if (evsel__name_is(evsel, kvm_exit_trace(EM_AARCH64))) {
 		event_get_key(evsel, sample, key);
 		return true;
 	}
 	return false;
 }
 
-static struct kvm_events_ops exit_events = {
+static const struct kvm_events_ops exit_events = {
 	.is_begin_event = event_begin,
 	.is_end_event	= event_end,
 	.decode_key	= exit_event_decode_key,
 	.name		= "VM-EXIT"
 };
 
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
 	{
 		.name	= "vmexit",
 		.ops	= &exit_events,
@@ -73,12 +69,27 @@ struct kvm_reg_events_ops kvm_reg_events_ops[] = {
 	{ NULL, NULL },
 };
 
-const char * const kvm_skip_events[] = {
+static const char * const __kvm_skip_events[] = {
 	NULL,
 };
 
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+int __cpu_isa_init_arm64(struct perf_kvm_stat *kvm)
 {
 	kvm->exit_reasons_isa = "arm64";
 	return 0;
 }
+
+const char * const *__kvm_events_tp_arm64(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_arm64(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_arm64(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/arch/loongarch/util/kvm-stat.c b/tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c
index a7859a3a9a51..b802e516b138 100644
--- a/tools/perf/arch/loongarch/util/kvm-stat.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c
@@ -1,12 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
 #include <memory.h>
-#include "util/kvm-stat.h"
-#include "util/parse-events.h"
-#include "util/debug.h"
-#include "util/evsel.h"
-#include "util/evlist.h"
-#include "util/pmus.h"
+#include <dwarf-regs.h>
+#include "../kvm-stat.h"
+#include "../parse-events.h"
+#include "../debug.h"
+#include "../evsel.h"
+#include "../evlist.h"
+#include "../pmus.h"
 
 #define LOONGARCH_EXCEPTION_INT		0
 #define LOONGARCH_EXCEPTION_PIL		1
@@ -43,12 +44,8 @@
 
 define_exit_reasons_table(loongarch_exit_reasons, loongarch_exception_type);
 
-const char *vcpu_id_str = "vcpu_id";
-const char *kvm_exit_reason = "reason";
-const char *kvm_entry_trace = "kvm:kvm_enter";
-const char *kvm_reenter_trace = "kvm:kvm_reenter";
-const char *kvm_exit_trace = "kvm:kvm_exit";
-const char *kvm_events_tp[] = {
+static const char *kvm_reenter_trace = "kvm:kvm_reenter";
+static const char * const __kvm_events_tp[] = {
 	"kvm:kvm_enter",
 	"kvm:kvm_reenter",
 	"kvm:kvm_exit",
@@ -74,7 +71,8 @@ static bool event_end(struct evsel *evsel,
 	 *   kvm:kvm_enter   means returning to vmm and then to guest
 	 *   kvm:kvm_reenter means returning to guest immediately
 	 */
-	return evsel__name_is(evsel, kvm_entry_trace) || evsel__name_is(evsel, kvm_reenter_trace);
+	return evsel__name_is(evsel, kvm_entry_trace(EM_LOONGARCH)) ||
+	       evsel__name_is(evsel, kvm_reenter_trace);
 }
 
 static void event_gspr_get_key(struct evsel *evsel,
@@ -109,12 +107,12 @@ static void event_gspr_get_key(struct evsel *evsel,
 	}
 }
 
-static struct child_event_ops child_events[] = {
+static const struct child_event_ops child_events[] = {
 	{ .name = "kvm:kvm_exit_gspr", .get_key = event_gspr_get_key },
 	{ NULL, NULL },
 };
 
-static struct kvm_events_ops exit_events = {
+static const struct kvm_events_ops exit_events = {
 	.is_begin_event = event_begin,
 	.is_end_event = event_end,
 	.child_ops = child_events,
@@ -122,18 +120,33 @@ static struct kvm_events_ops exit_events = {
 	.name = "VM-EXIT"
 };
 
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
 	{ .name	= "vmexit", .ops = &exit_events, },
 	{ NULL, NULL },
 };
 
-const char * const kvm_skip_events[] = {
+static const char * const __kvm_skip_events[] = {
 	NULL,
 };
 
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+int __cpu_isa_init_loongarch(struct perf_kvm_stat *kvm)
 {
 	kvm->exit_reasons_isa = "loongarch64";
 	kvm->exit_reasons = loongarch_exit_reasons;
 	return 0;
 }
+
+const char * const *__kvm_events_tp_loongarch(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_loongarch(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_loongarch(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/util/kvm-stat-arch/kvm-stat-powerpc.c
index c8357b571ccf..42182d70beb6 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-powerpc.c
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
-#include "util/kvm-stat.h"
-#include "util/parse-events.h"
-#include "util/debug.h"
-#include "util/evsel.h"
-#include "util/evlist.h"
-#include "util/pmus.h"
+#include "../kvm-stat.h"
+#include "../parse-events.h"
+#include "../debug.h"
+#include "../evsel.h"
+#include "../evlist.h"
+#include "../pmus.h"
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
@@ -13,15 +13,11 @@
 
 #define NR_TPS 4
 
-const char *vcpu_id_str = "vcpu_id";
-const char *kvm_entry_trace = "kvm_hv:kvm_guest_enter";
-const char *kvm_exit_trace = "kvm_hv:kvm_guest_exit";
-
 define_exit_reasons_table(hv_exit_reasons, kvm_trace_symbol_exit);
 define_exit_reasons_table(hcall_reasons, kvm_trace_symbol_hcall);
 
 /* Tracepoints specific to ppc_book3s_hv */
-const char *ppc_book3s_hv_kvm_tp[] = {
+static const char * const ppc_book3s_hv_kvm_tp[] = {
 	"kvm_hv:kvm_guest_enter",
 	"kvm_hv:kvm_guest_exit",
 	"kvm_hv:kvm_hcall_enter",
@@ -30,8 +26,7 @@ const char *ppc_book3s_hv_kvm_tp[] = {
 };
 
 /* 1 extra placeholder for NULL */
-const char *kvm_events_tp[NR_TPS + 1];
-const char *kvm_exit_reason;
+static const char *__kvm_events_tp[NR_TPS + 1];
 
 static void hcall_event_get_key(struct evsel *evsel,
 				struct perf_sample *sample,
@@ -60,13 +55,13 @@ static bool hcall_event_end(struct evsel *evsel,
 			    struct perf_sample *sample __maybe_unused,
 			    struct event_key *key __maybe_unused)
 {
-	return (evsel__name_is(evsel, kvm_events_tp[3]));
+	return evsel__name_is(evsel, __kvm_events_tp[3]);
 }
 
 static bool hcall_event_begin(struct evsel *evsel,
 			      struct perf_sample *sample, struct event_key *key)
 {
-	if (evsel__name_is(evsel, kvm_events_tp[2])) {
+	if (evsel__name_is(evsel, __kvm_events_tp[2])) {
 		hcall_event_get_key(evsel, sample, key);
 		return true;
 	}
@@ -82,27 +77,27 @@ static void hcall_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
 	scnprintf(decode, KVM_EVENT_NAME_LEN, "%s", hcall_reason);
 }
 
-static struct kvm_events_ops hcall_events = {
+static const struct kvm_events_ops hcall_events = {
 	.is_begin_event = hcall_event_begin,
 	.is_end_event = hcall_event_end,
 	.decode_key = hcall_event_decode_key,
 	.name = "HCALL-EVENT",
 };
 
-static struct kvm_events_ops exit_events = {
+static const struct kvm_events_ops exit_events = {
 	.is_begin_event = exit_event_begin,
 	.is_end_event = exit_event_end,
 	.decode_key = exit_event_decode_key,
 	.name = "VM-EXIT"
 };
 
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
 	{ .name = "vmexit", .ops = &exit_events },
 	{ .name = "hcall", .ops = &hcall_events },
 	{ NULL, NULL },
 };
 
-const char * const kvm_skip_events[] = {
+static const char * const __kvm_skip_events[] = {
 	NULL,
 };
 
@@ -123,7 +118,7 @@ static int is_tracepoint_available(const char *str, struct evlist *evlist)
 static int ppc__setup_book3s_hv(struct perf_kvm_stat *kvm,
 				struct evlist *evlist)
 {
-	const char **events_ptr;
+	const char * const *events_ptr;
 	int i, nr_tp = 0, err = -1;
 
 	/* Check for book3s_hv tracepoints */
@@ -135,10 +130,9 @@ static int ppc__setup_book3s_hv(struct perf_kvm_stat *kvm,
 	}
 
 	for (i = 0; i < nr_tp; i++)
-		kvm_events_tp[i] = ppc_book3s_hv_kvm_tp[i];
+		__kvm_events_tp[i] = ppc_book3s_hv_kvm_tp[i];
 
-	kvm_events_tp[i] = NULL;
-	kvm_exit_reason = "trap";
+	__kvm_events_tp[i] = NULL;
 	kvm->exit_reasons = hv_exit_reasons;
 	kvm->exit_reasons_isa = "HV";
 
@@ -157,12 +151,12 @@ static int ppc__setup_kvm_tp(struct perf_kvm_stat *kvm)
 	return ppc__setup_book3s_hv(kvm, evlist);
 }
 
-int setup_kvm_events_tp(struct perf_kvm_stat *kvm)
+int __setup_kvm_events_tp_powerpc(struct perf_kvm_stat *kvm)
 {
 	return ppc__setup_kvm_tp(kvm);
 }
 
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+int __cpu_isa_init_powerpc(struct perf_kvm_stat *kvm)
 {
 	int ret;
 
@@ -184,7 +178,7 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
  *
  * Function to parse the arguments and return appropriate values.
  */
-int kvm_add_default_arch_event(int *argc, const char **argv)
+int __kvm_add_default_arch_event_powerpc(int *argc, const char **argv)
 {
 	const char **tmp;
 	bool event = false;
@@ -217,3 +211,18 @@ int kvm_add_default_arch_event(int *argc, const char **argv)
 	free(tmp);
 	return 0;
 }
+
+const char * const *__kvm_events_tp_powerpc(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_powerpc(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_powerpc(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/arch/riscv/util/kvm-stat.c b/tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c
index 3ea7acb5e159..8d4d5d6ce720 100644
--- a/tools/perf/arch/riscv/util/kvm-stat.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c
@@ -7,19 +7,14 @@
  */
 #include <errno.h>
 #include <memory.h>
-#include "../../../util/evsel.h"
-#include "../../../util/kvm-stat.h"
+#include "../evsel.h"
+#include "../kvm-stat.h"
 #include "riscv_trap_types.h"
 #include "debug.h"
 
 define_exit_reasons_table(riscv_exit_reasons, kvm_riscv_trap_class);
 
-const char *vcpu_id_str = "id";
-const char *kvm_exit_reason = "scause";
-const char *kvm_entry_trace = "kvm:kvm_entry";
-const char *kvm_exit_trace = "kvm:kvm_exit";
-
-const char *kvm_events_tp[] = {
+static const char * const __kvm_events_tp[] = {
 	"kvm:kvm_entry",
 	"kvm:kvm_exit",
 	NULL,
@@ -29,8 +24,10 @@ static void event_get_key(struct evsel *evsel,
 			  struct perf_sample *sample,
 			  struct event_key *key)
 {
+	int xlen = 64; // TODO: 32-bit support.
+
 	key->info = 0;
-	key->key = evsel__intval(evsel, sample, kvm_exit_reason) & ~CAUSE_IRQ_FLAG;
+	key->key = evsel__intval(evsel, sample, kvm_exit_reason(EM_RISCV)) & ~CAUSE_IRQ_FLAG(xlen);
 	key->exit_reasons = riscv_exit_reasons;
 }
 
@@ -38,28 +35,28 @@ static bool event_begin(struct evsel *evsel,
 			struct perf_sample *sample __maybe_unused,
 			struct event_key *key __maybe_unused)
 {
-	return evsel__name_is(evsel, kvm_entry_trace);
+	return evsel__name_is(evsel, kvm_entry_trace(EM_RISCV));
 }
 
 static bool event_end(struct evsel *evsel,
 		      struct perf_sample *sample,
 		      struct event_key *key)
 {
-	if (evsel__name_is(evsel, kvm_exit_trace)) {
+	if (evsel__name_is(evsel, kvm_exit_trace(EM_RISCV))) {
 		event_get_key(evsel, sample, key);
 		return true;
 	}
 	return false;
 }
 
-static struct kvm_events_ops exit_events = {
+static const struct kvm_events_ops exit_events = {
 	.is_begin_event = event_begin,
 	.is_end_event	= event_end,
 	.decode_key	= exit_event_decode_key,
 	.name		= "VM-EXIT"
 };
 
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
 	{
 		.name	= "vmexit",
 		.ops	= &exit_events,
@@ -67,12 +64,27 @@ struct kvm_reg_events_ops kvm_reg_events_ops[] = {
 	{ NULL, NULL },
 };
 
-const char * const kvm_skip_events[] = {
+static const char * const __kvm_skip_events[] = {
 	NULL,
 };
 
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+int __cpu_isa_init_riscv(struct perf_kvm_stat *kvm)
 {
 	kvm->exit_reasons_isa = "riscv64";
 	return 0;
 }
+
+const char * const *__kvm_events_tp_riscv(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_riscv(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_riscv(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/arch/s390/util/kvm-stat.c b/tools/perf/util/kvm-stat-arch/kvm-stat-s390.c
index 0aed92df51ba..7e29169f5bb0 100644
--- a/tools/perf/arch/s390/util/kvm-stat.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-s390.c
@@ -8,9 +8,9 @@
 
 #include <errno.h>
 #include <string.h>
-#include "../../util/kvm-stat.h"
-#include "../../util/evsel.h"
-#include <asm/sie.h>
+#include "../kvm-stat.h"
+#include "../evsel.h"
+#include "../../../arch/s390/include/uapi/asm/sie.h"
 
 define_exit_reasons_table(sie_exit_reasons, sie_intercept_code);
 define_exit_reasons_table(sie_icpt_insn_codes, icpt_insn_codes);
@@ -18,16 +18,11 @@ define_exit_reasons_table(sie_sigp_order_codes, sigp_order_codes);
 define_exit_reasons_table(sie_diagnose_codes, diagnose_codes);
 define_exit_reasons_table(sie_icpt_prog_codes, icpt_prog_codes);
 
-const char *vcpu_id_str = "id";
-const char *kvm_exit_reason = "icptcode";
-const char *kvm_entry_trace = "kvm:kvm_s390_sie_enter";
-const char *kvm_exit_trace = "kvm:kvm_s390_sie_exit";
-
 static void event_icpt_insn_get_key(struct evsel *evsel,
 				    struct perf_sample *sample,
 				    struct event_key *key)
 {
-	unsigned long insn;
+	u64 insn;
 
 	insn = evsel__intval(evsel, sample, "instruction");
 	key->key = icpt_insn_decoder(insn);
@@ -58,7 +53,7 @@ static void event_icpt_prog_get_key(struct evsel *evsel,
 	key->exit_reasons = sie_icpt_prog_codes;
 }
 
-static struct child_event_ops child_events[] = {
+static const struct child_event_ops child_events[] = {
 	{ .name = "kvm:kvm_s390_intercept_instruction",
 	  .get_key = event_icpt_insn_get_key },
 	{ .name = "kvm:kvm_s390_handle_sigp",
@@ -70,7 +65,7 @@ static struct child_event_ops child_events[] = {
 	{ NULL, NULL },
 };
 
-static struct kvm_events_ops exit_events = {
+static const struct kvm_events_ops exit_events = {
 	.is_begin_event = exit_event_begin,
 	.is_end_event = exit_event_end,
 	.child_ops = child_events,
@@ -78,7 +73,7 @@ static struct kvm_events_ops exit_events = {
 	.name = "VM-EXIT"
 };
 
-const char *kvm_events_tp[] = {
+static const char * const __kvm_events_tp[] = {
 	"kvm:kvm_s390_sie_enter",
 	"kvm:kvm_s390_sie_exit",
 	"kvm:kvm_s390_intercept_instruction",
@@ -88,17 +83,17 @@ const char *kvm_events_tp[] = {
 	NULL,
 };
 
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
 	{ .name = "vmexit", .ops = &exit_events },
 	{ NULL, NULL },
 };
 
-const char * const kvm_skip_events[] = {
+static const char * const __kvm_skip_events[] = {
 	"Wait state",
 	NULL,
 };
 
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid)
+int __cpu_isa_init_s390(struct perf_kvm_stat *kvm, const char *cpuid)
 {
 	if (strstr(cpuid, "IBM")) {
 		kvm->exit_reasons = sie_exit_reasons;
@@ -108,3 +103,18 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid)
 
 	return 0;
 }
+
+const char * const *__kvm_events_tp_s390(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_s390(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_s390(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/arch/x86/util/kvm-stat.c b/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
index bff36f9345ea..43275d25b6cb 100644
--- a/tools/perf/arch/x86/util/kvm-stat.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
@@ -1,29 +1,24 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
 #include <string.h>
-#include "../../../util/kvm-stat.h"
-#include "../../../util/evsel.h"
-#include "../../../util/env.h"
-#include <asm/svm.h>
-#include <asm/vmx.h>
-#include <asm/kvm.h>
+#include "../kvm-stat.h"
+#include "../evsel.h"
+#include "../env.h"
+#include "../../arch/x86/include/uapi/asm/svm.h"
+#include "../../arch/x86/include/uapi/asm/vmx.h"
+#include "../../arch/x86/include/uapi/asm/kvm.h"
 #include <subcmd/parse-options.h>
 
 define_exit_reasons_table(vmx_exit_reasons, VMX_EXIT_REASONS);
 define_exit_reasons_table(svm_exit_reasons, SVM_EXIT_REASONS);
 
-static struct kvm_events_ops exit_events = {
+static const struct kvm_events_ops exit_events = {
 	.is_begin_event = exit_event_begin,
 	.is_end_event = exit_event_end,
 	.decode_key = exit_event_decode_key,
 	.name = "VM-EXIT"
 };
 
-const char *vcpu_id_str = "vcpu_id";
-const char *kvm_exit_reason = "exit_reason";
-const char *kvm_entry_trace = "kvm:kvm_entry";
-const char *kvm_exit_trace = "kvm:kvm_exit";
-
 /*
  * For the mmio events, we treat:
  * the time of MMIO write: kvm_mmio(KVM_TRACE_MMIO_WRITE...) -> kvm_entry
@@ -83,7 +78,7 @@ static void mmio_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
 		  key->info == KVM_TRACE_MMIO_WRITE ? "W" : "R");
 }
 
-static struct kvm_events_ops mmio_events = {
+static const struct kvm_events_ops mmio_events = {
 	.is_begin_event = mmio_event_begin,
 	.is_end_event = mmio_event_end,
 	.decode_key = mmio_event_decode_key,
@@ -127,7 +122,7 @@ static void ioport_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
 		  key->info ? "POUT" : "PIN");
 }
 
-static struct kvm_events_ops ioport_events = {
+static const struct kvm_events_ops ioport_events = {
 	.is_begin_event = ioport_event_begin,
 	.is_end_event = ioport_event_end,
 	.decode_key = ioport_event_decode_key,
@@ -171,14 +166,14 @@ static void msr_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
 		  key->info ? "W" : "R");
 }
 
-static struct kvm_events_ops msr_events = {
+static const struct kvm_events_ops msr_events = {
 	.is_begin_event = msr_event_begin,
 	.is_end_event = msr_event_end,
 	.decode_key = msr_event_decode_key,
 	.name = "MSR Access"
 };
 
-const char *kvm_events_tp[] = {
+static const char * const __kvm_events_tp[] = {
 	"kvm:kvm_entry",
 	"kvm:kvm_exit",
 	"kvm:kvm_mmio",
@@ -187,7 +182,7 @@ const char *kvm_events_tp[] = {
 	NULL,
 };
 
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
 	{ .name = "vmexit", .ops = &exit_events },
 	{ .name = "mmio", .ops = &mmio_events },
 	{ .name = "ioport", .ops = &ioport_events },
@@ -195,12 +190,12 @@ struct kvm_reg_events_ops kvm_reg_events_ops[] = {
 	{ NULL, NULL },
 };
 
-const char * const kvm_skip_events[] = {
+static const char * const __kvm_skip_events[] = {
 	"HLT",
 	NULL,
 };
 
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid)
+int __cpu_isa_init_x86(struct perf_kvm_stat *kvm, const char *cpuid)
 {
 	if (strstr(cpuid, "Intel")) {
 		kvm->exit_reasons = vmx_exit_reasons;
@@ -226,7 +221,7 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid)
  * So, to avoid this issue explicitly use "cycles" instead of "cycles:P" event
  * by default to sample guest on Intel platforms.
  */
-int kvm_add_default_arch_event(int *argc, const char **argv)
+int __kvm_add_default_arch_event_x86(int *argc, const char **argv)
 {
 	const char **tmp;
 	bool event = false;
@@ -262,3 +257,18 @@ EXIT:
 	free(tmp);
 	return ret;
 }
+
+const char * const *__kvm_events_tp_x86(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_x86(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_x86(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/arch/riscv/util/riscv_trap_types.h b/tools/perf/util/kvm-stat-arch/riscv_trap_types.h
index 6cc71eb01fca..aa5d24fab4ee 100644
--- a/tools/perf/arch/riscv/util/riscv_trap_types.h
+++ b/tools/perf/util/kvm-stat-arch/riscv_trap_types.h
@@ -3,7 +3,7 @@
 #define ARCH_PERF_RISCV_TRAP_TYPES_H
 
 /* Exception cause high bit - is an interrupt if set */
-#define CAUSE_IRQ_FLAG		(_AC(1, UL) << (__riscv_xlen - 1))
+#define CAUSE_IRQ_FLAG(xlen)		(_AC(1, UL) << (xlen - 1))
 
 /* Interrupt causes (minus the high bit) */
 #define IRQ_S_SOFT 1
diff --git a/tools/perf/util/kvm-stat.c b/tools/perf/util/kvm-stat.c
index 38ace736db5c..27f16810498c 100644
--- a/tools/perf/util/kvm-stat.c
+++ b/tools/perf/util/kvm-stat.c
@@ -2,20 +2,23 @@
 #include "debug.h"
 #include "evsel.h"
 #include "kvm-stat.h"
-
-#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#include <dwarf-regs.h>
 
 bool kvm_exit_event(struct evsel *evsel)
 {
-	return evsel__name_is(evsel, kvm_exit_trace);
+	uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
+
+	return evsel__name_is(evsel, kvm_exit_trace(e_machine));
 }
 
 void exit_event_get_key(struct evsel *evsel,
 			struct perf_sample *sample,
 			struct event_key *key)
 {
+	uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
+
 	key->info = 0;
-	key->key  = evsel__intval(evsel, sample, kvm_exit_reason);
+	key->key  = evsel__intval(evsel, sample, kvm_exit_reason(e_machine));
 }
 
 
@@ -32,7 +35,9 @@ bool exit_event_begin(struct evsel *evsel,
 
 bool kvm_entry_event(struct evsel *evsel)
 {
-	return evsel__name_is(evsel, kvm_entry_trace);
+	uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
+
+	return evsel__name_is(evsel, kvm_entry_trace(e_machine));
 }
 
 bool exit_event_end(struct evsel *evsel,
@@ -67,4 +72,202 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm,
 	scnprintf(decode, KVM_EVENT_NAME_LEN, "%s", exit_reason);
 }
 
-#endif
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm, uint16_t e_machine)
+{
+	switch (e_machine) {
+	case EM_PPC:
+	case EM_PPC64:
+		return __setup_kvm_events_tp_powerpc(kvm);
+	default:
+		return 0;
+	}
+}
+
+int cpu_isa_init(struct perf_kvm_stat *kvm, uint16_t e_machine, const char *cpuid)
+{
+	switch (e_machine) {
+	case EM_AARCH64:
+		return __cpu_isa_init_arm64(kvm);
+	case EM_LOONGARCH:
+		return __cpu_isa_init_loongarch(kvm);
+	case EM_PPC:
+	case EM_PPC64:
+		return __cpu_isa_init_powerpc(kvm);
+	case EM_RISCV:
+		return __cpu_isa_init_riscv(kvm);
+	case EM_S390:
+		return __cpu_isa_init_s390(kvm, cpuid);
+	case EM_X86_64:
+	case EM_386:
+		return __cpu_isa_init_x86(kvm, cpuid);
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
+		return -1;
+	}
+}
+
+const char *vcpu_id_str(uint16_t e_machine)
+{
+	switch (e_machine) {
+	case EM_AARCH64:
+	case EM_RISCV:
+	case EM_S390:
+		return "id";
+	case EM_LOONGARCH:
+	case EM_PPC:
+	case EM_PPC64:
+	case EM_X86_64:
+	case EM_386:
+		return "vcpu_id";
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
+		return NULL;
+	}
+}
+
+const char *kvm_exit_reason(uint16_t e_machine)
+{
+	switch (e_machine) {
+	case EM_AARCH64:
+		return "ret";
+	case EM_LOONGARCH:
+		return "reason";
+	case EM_PPC:
+	case EM_PPC64:
+		return "trap";
+	case EM_RISCV:
+		return "scause";
+	case EM_S390:
+		return "icptcode";
+	case EM_X86_64:
+	case EM_386:
+		return "exit_reason";
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
+		return NULL;
+	}
+}
+
+const char *kvm_entry_trace(uint16_t e_machine)
+{
+	switch (e_machine) {
+	case EM_AARCH64:
+	case EM_RISCV:
+	case EM_X86_64:
+	case EM_386:
+		return "kvm:kvm_entry";
+	case EM_LOONGARCH:
+		return "kvm:kvm_enter";
+	case EM_PPC:
+	case EM_PPC64:
+		return "kvm_hv:kvm_guest_enter";
+	case EM_S390:
+		return "kvm:kvm_s390_sie_enter";
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
+		return NULL;
+	}
+}
+
+const char *kvm_exit_trace(uint16_t e_machine)
+{
+	switch (e_machine) {
+	case EM_AARCH64:
+	case EM_LOONGARCH:
+	case EM_RISCV:
+	case EM_X86_64:
+	case EM_386:
+		return "kvm:kvm_exit";
+	case EM_PPC:
+	case EM_PPC64:
+		return "kvm_hv:kvm_guest_exit";
+	case EM_S390:
+		return "kvm:kvm_s390_sie_exit";
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
+		return NULL;
+	}
+}
+
+const char * const *kvm_events_tp(uint16_t e_machine)
+{
+	switch (e_machine) {
+	case EM_AARCH64:
+		return __kvm_events_tp_arm64();
+	case EM_LOONGARCH:
+		return __kvm_events_tp_loongarch();
+	case EM_PPC:
+	case EM_PPC64:
+		return __kvm_events_tp_powerpc();
+	case EM_RISCV:
+		return __kvm_events_tp_riscv();
+	case EM_S390:
+		return __kvm_events_tp_s390();
+	case EM_X86_64:
+	case EM_386:
+		return __kvm_events_tp_x86();
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
+		return NULL;
+	}
+}
+
+const struct kvm_reg_events_ops *kvm_reg_events_ops(uint16_t e_machine)
+{
+	switch (e_machine) {
+	case EM_AARCH64:
+		return __kvm_reg_events_ops_arm64();
+	case EM_LOONGARCH:
+		return __kvm_reg_events_ops_loongarch();
+	case EM_PPC:
+	case EM_PPC64:
+		return __kvm_reg_events_ops_powerpc();
+	case EM_RISCV:
+		return __kvm_reg_events_ops_riscv();
+	case EM_S390:
+		return __kvm_reg_events_ops_s390();
+	case EM_X86_64:
+	case EM_386:
+		return __kvm_reg_events_ops_x86();
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
+		return NULL;
+	}
+}
+
+const char * const *kvm_skip_events(uint16_t e_machine)
+{
+	switch (e_machine) {
+	case EM_AARCH64:
+		return __kvm_skip_events_arm64();
+	case EM_LOONGARCH:
+		return __kvm_skip_events_loongarch();
+	case EM_PPC:
+	case EM_PPC64:
+		return __kvm_skip_events_powerpc();
+	case EM_RISCV:
+		return __kvm_skip_events_riscv();
+	case EM_S390:
+		return __kvm_skip_events_s390();
+	case EM_X86_64:
+	case EM_386:
+		return __kvm_skip_events_x86();
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
+		return NULL;
+	}
+}
+
+int kvm_add_default_arch_event(uint16_t e_machine, int *argc, const char **argv)
+{
+	switch (e_machine) {
+	case EM_PPC:
+	case EM_PPC64:
+		return __kvm_add_default_arch_event_powerpc(argc, argv);
+	case EM_X86_64:
+	case EM_386:
+		return __kvm_add_default_arch_event_x86(argc, argv);
+	default:
+		return 0;
+	}
+}
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index a356b839c2ee..4a998aaece5d 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -2,8 +2,6 @@
 #ifndef __PERF_KVM_STAT_H
 #define __PERF_KVM_STAT_H
 
-#ifdef HAVE_KVM_STAT_SUPPORT
-
 #include "tool.h"
 #include "sort.h"
 #include "stat.h"
@@ -67,7 +65,7 @@ struct kvm_events_ops {
 			       struct event_key *key);
 	bool (*is_end_event)(struct evsel *evsel,
 			     struct perf_sample *sample, struct event_key *key);
-	struct child_event_ops *child_ops;
+	const struct child_event_ops *child_ops;
 	void (*decode_key)(struct perf_kvm_stat *kvm, struct event_key *key,
 			   char *decode);
 	const char *name;
@@ -95,7 +93,7 @@ struct perf_kvm_stat {
 	struct exit_reasons_table *exit_reasons;
 	const char *exit_reasons_isa;
 
-	struct kvm_events_ops *events_ops;
+	const struct kvm_events_ops *events_ops;
 
 	u64 total_time;
 	u64 total_count;
@@ -113,10 +111,10 @@ struct perf_kvm_stat {
 
 struct kvm_reg_events_ops {
 	const char *name;
-	struct kvm_events_ops *ops;
+	const struct kvm_events_ops *ops;
 };
 
-#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#ifdef HAVE_LIBTRACEEVENT
 
 void exit_event_get_key(struct evsel *evsel,
 			struct perf_sample *sample,
@@ -130,11 +128,9 @@ bool exit_event_end(struct evsel *evsel,
 void exit_event_decode_key(struct perf_kvm_stat *kvm,
 			   struct event_key *key,
 			   char *decode);
-#endif
 
 bool kvm_exit_event(struct evsel *evsel);
 bool kvm_entry_event(struct evsel *evsel);
-int setup_kvm_events_tp(struct perf_kvm_stat *kvm);
 
 #define define_exit_reasons_table(name, symbols)	\
 	static struct exit_reasons_table name[] = {	\
@@ -144,15 +140,60 @@ int setup_kvm_events_tp(struct perf_kvm_stat *kvm);
 /*
  * arch specific callbacks and data structures
  */
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid);
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm, uint16_t e_machine);
+int __setup_kvm_events_tp_powerpc(struct perf_kvm_stat *kvm);
+
+int cpu_isa_init(struct perf_kvm_stat *kvm, uint16_t e_machine, const char *cpuid);
+int __cpu_isa_init_arm64(struct perf_kvm_stat *kvm);
+int __cpu_isa_init_loongarch(struct perf_kvm_stat *kvm);
+int __cpu_isa_init_powerpc(struct perf_kvm_stat *kvm);
+int __cpu_isa_init_riscv(struct perf_kvm_stat *kvm);
+int __cpu_isa_init_s390(struct perf_kvm_stat *kvm, const char *cpuid);
+int __cpu_isa_init_x86(struct perf_kvm_stat *kvm, const char *cpuid);
+
+const char *vcpu_id_str(uint16_t e_machine);
+const char *kvm_exit_reason(uint16_t e_machine);
+const char *kvm_entry_trace(uint16_t e_machine);
+const char *kvm_exit_trace(uint16_t e_machine);
+
+const char * const *kvm_events_tp(uint16_t e_machine);
+const char * const *__kvm_events_tp_arm64(void);
+const char * const *__kvm_events_tp_loongarch(void);
+const char * const *__kvm_events_tp_powerpc(void);
+const char * const *__kvm_events_tp_riscv(void);
+const char * const *__kvm_events_tp_s390(void);
+const char * const *__kvm_events_tp_x86(void);
+
+const struct kvm_reg_events_ops *kvm_reg_events_ops(uint16_t e_machine);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_arm64(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_loongarch(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_powerpc(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_riscv(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_s390(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_x86(void);
+
+const char * const *kvm_skip_events(uint16_t e_machine);
+const char * const *__kvm_skip_events_arm64(void);
+const char * const *__kvm_skip_events_loongarch(void);
+const char * const *__kvm_skip_events_powerpc(void);
+const char * const *__kvm_skip_events_riscv(void);
+const char * const *__kvm_skip_events_s390(void);
+const char * const *__kvm_skip_events_x86(void);
+
+int kvm_add_default_arch_event(uint16_t e_machine, int *argc, const char **argv);
+int __kvm_add_default_arch_event_powerpc(int *argc, const char **argv);
+int __kvm_add_default_arch_event_x86(int *argc, const char **argv);
+
+#else /* !HAVE_LIBTRACEEVENT */
+
+static inline int kvm_add_default_arch_event(uint16_t e_machine __maybe_unused,
+					     int *argc __maybe_unused,
+					     const char **argv __maybe_unused)
+{
+	return 0;
+}
 
-extern const char *kvm_events_tp[];
-extern struct kvm_reg_events_ops kvm_reg_events_ops[];
-extern const char * const kvm_skip_events[];
-extern const char *vcpu_id_str;
-extern const char *kvm_exit_reason;
-extern const char *kvm_entry_trace;
-extern const char *kvm_exit_trace;
+#endif /* HAVE_LIBTRACEEVENT */
 
 static inline struct kvm_info *kvm_info__get(struct kvm_info *ki)
 {
@@ -186,11 +227,6 @@ static inline struct kvm_info *kvm_info__new(void)
 	return ki;
 }
 
-#else /* HAVE_KVM_STAT_SUPPORT */
-// We use this unconditionally in hists__findnew_entry() and hist_entry__delete()
-#define kvm_info__zput(ki) do { } while (0)
-#endif /* HAVE_KVM_STAT_SUPPORT */
-
 #define STRDUP_FAIL_EXIT(s)		\
 	({	char *_p;		\
 		_p = strdup(s);		\
@@ -201,5 +237,4 @@ static inline struct kvm_info *kvm_info__new(void)
 		_p;			\
 	})
 
-extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
diff --git a/tools/perf/util/libbfd.c b/tools/perf/util/libbfd.c
index 79f4528234a9..63ea3fb53e77 100644
--- a/tools/perf/util/libbfd.c
+++ b/tools/perf/util/libbfd.c
@@ -501,7 +501,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused,
 	struct bpf_prog_info_node *info_node;
 	int len = sym->end - sym->start;
 	disassembler_ftype disassemble;
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct perf_bpil *info_linear;
 	struct disassemble_info info;
 	struct dso *dso = map__dso(map);
@@ -612,7 +612,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused,
 			args->line = strdup(srcline);
 			args->line_nr = 0;
 			args->fileloc = NULL;
-			args->ms.sym  = sym;
+			args->ms->sym = sym;
 			dl = disasm_line__new(args);
 			if (dl) {
 				annotation_line__add(&dl->al,
@@ -624,7 +624,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused,
 		args->line = buf + prev_buf_size;
 		args->line_nr = 0;
 		args->fileloc = NULL;
-		args->ms.sym  = sym;
+		args->ms->sym = sym;
 		dl = disasm_line__new(args);
 		if (dl)
 			annotation_line__add(&dl->al, &notes->src->source);
diff --git a/tools/perf/util/libdw.c b/tools/perf/util/libdw.c
new file mode 100644
index 000000000000..216977884103
--- /dev/null
+++ b/tools/perf/util/libdw.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "dso.h"
+#include "libdw.h"
+#include "srcline.h"
+#include "symbol.h"
+#include "dwarf-aux.h"
+#include <fcntl.h>
+#include <unistd.h>
+#include <elfutils/libdwfl.h>
+
+static const Dwfl_Callbacks offline_callbacks = {
+	.find_debuginfo = dwfl_standard_find_debuginfo,
+	.section_address = dwfl_offline_section_address,
+	.find_elf = dwfl_build_id_find_elf,
+};
+
+void dso__free_libdw(struct dso *dso)
+{
+	Dwfl *dwfl = dso__libdw(dso);
+
+	if (dwfl) {
+		dwfl_end(dwfl);
+		dso__set_libdw(dso, NULL);
+	}
+}
+
+struct Dwfl *dso__libdw_dwfl(struct dso *dso)
+{
+	Dwfl *dwfl = dso__libdw(dso);
+	const char *dso_name;
+	Dwfl_Module *mod;
+	int fd;
+
+	if (dwfl)
+		return dwfl;
+
+	dso_name = dso__long_name(dso);
+	/*
+	 * Initialize Dwfl session.
+	 * We need to open the DSO file to report it to libdw.
+	 */
+	fd = open(dso_name, O_RDONLY);
+	if (fd < 0)
+		return NULL;
+
+	dwfl = dwfl_begin(&offline_callbacks);
+	if (!dwfl) {
+		close(fd);
+		return NULL;
+	}
+
+	/*
+	 * If the report is successful, the file descriptor fd is consumed
+	 * and closed by the Dwfl. If not, it is not closed.
+	 */
+	mod = dwfl_report_offline(dwfl, dso_name, dso_name, fd);
+	if (!mod) {
+		dwfl_end(dwfl);
+		close(fd);
+		return NULL;
+	}
+
+	dwfl_report_end(dwfl, /*removed=*/NULL, /*arg=*/NULL);
+	dso__set_libdw(dso, dwfl);
+
+	return dwfl;
+}
+
+struct libdw_a2l_cb_args {
+	struct dso *dso;
+	struct symbol *sym;
+	struct inline_node *node;
+	char *leaf_srcline;
+	bool leaf_srcline_used;
+};
+
+static int libdw_a2l_cb(Dwarf_Die *die, void *_args)
+{
+	struct libdw_a2l_cb_args *args  = _args;
+	struct symbol *inline_sym = new_inline_sym(args->dso, args->sym, dwarf_diename(die));
+	const char *call_fname = die_get_call_file(die);
+	char *call_srcline = srcline__unknown;
+	struct inline_list *ilist;
+
+	if (!inline_sym)
+		return -ENOMEM;
+
+	/* Assign caller information to the parent. */
+	if (call_fname)
+		call_srcline = srcline_from_fileline(call_fname, die_get_call_lineno(die));
+
+	list_for_each_entry(ilist, &args->node->val, list) {
+		if (args->leaf_srcline == ilist->srcline)
+			args->leaf_srcline_used = false;
+		else if (ilist->srcline != srcline__unknown)
+			free(ilist->srcline);
+		ilist->srcline =  call_srcline;
+		call_srcline = NULL;
+		break;
+	}
+	if (call_srcline && call_srcline != srcline__unknown)
+		free(call_srcline);
+
+	/* Add this symbol to the chain as the leaf. */
+	if (!args->leaf_srcline_used) {
+		inline_list__append_tail(inline_sym, args->leaf_srcline, args->node);
+		args->leaf_srcline_used = true;
+	} else {
+		inline_list__append_tail(inline_sym, strdup(args->leaf_srcline), args->node);
+	}
+	return 0;
+}
+
+int libdw__addr2line(u64 addr, char **file, unsigned int *line_nr,
+		     struct dso *dso, bool unwind_inlines,
+		     struct inline_node *node, struct symbol *sym)
+{
+	Dwfl *dwfl = dso__libdw_dwfl(dso);
+	Dwfl_Module *mod;
+	Dwfl_Line *dwline;
+	Dwarf_Addr bias;
+	const char *src;
+	int lineno = 0;
+
+	if (!dwfl)
+		return 0;
+
+	mod = dwfl_addrmodule(dwfl, addr);
+	if (!mod)
+		return 0;
+
+	/*
+	 * Get/ignore the dwarf information. Determine the bias, difference
+	 * between the regular ELF addr2line addresses and those to use with
+	 * libdw.
+	 */
+	if (!dwfl_module_getdwarf(mod, &bias))
+		return 0;
+
+	/* Find source line information for the address. */
+	dwline = dwfl_module_getsrc(mod, addr + bias);
+	if (!dwline)
+		return 0;
+
+	/* Get line information. */
+	src = dwfl_lineinfo(dwline, /*addr=*/NULL, &lineno, /*col=*/NULL, /*mtime=*/NULL,
+			    /*length=*/NULL);
+
+	if (file)
+		*file = src ? strdup(src) : NULL;
+	if (line_nr)
+		*line_nr = lineno;
+
+	/* Optionally unwind inline function call chain. */
+	if (unwind_inlines && node) {
+		Dwarf_Addr unused_bias;
+		Dwarf_Die *cudie = dwfl_module_addrdie(mod, addr + bias, &unused_bias);
+		struct libdw_a2l_cb_args args = {
+			.dso = dso,
+			.sym = sym,
+			.node = node,
+			.leaf_srcline = srcline_from_fileline(src ?: "<unknown>", lineno),
+		};
+
+		/* Walk from the parent down to the leaf. */
+		cu_walk_functions_at(cudie, addr, libdw_a2l_cb, &args);
+
+		if (!args.leaf_srcline_used)
+			free(args.leaf_srcline);
+	}
+	return 1;
+}
diff --git a/tools/perf/util/libdw.h b/tools/perf/util/libdw.h
new file mode 100644
index 000000000000..b12094737415
--- /dev/null
+++ b/tools/perf/util/libdw.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef PERF_LIBDW_H
+#define PERF_LIBDW_H
+
+#include <linux/types.h>
+
+struct dso;
+struct inline_node;
+struct symbol;
+
+#ifdef HAVE_LIBDW_SUPPORT
+/*
+ * libdw__addr2line - Convert address to source location using libdw
+ * @addr: Address to resolve
+ * @file: Pointer to return filename (caller must free)
+ * @line_nr: Pointer to return line number
+ * @dso: The dso struct
+ * @unwind_inlines: Whether to unwind inline function calls
+ * @node: Inline node list to append to
+ * @sym: The symbol associated with the address
+ *
+ * This function initializes a Dwfl context for the DSO if not already present,
+ * finds the source line information for the given address, and optionally
+ * resolves inline function call chains.
+ *
+ * Returns 1 on success (found), 0 on failure (not found).
+ */
+int libdw__addr2line(u64 addr, char **file,
+		     unsigned int *line_nr, struct dso *dso,
+		     bool unwind_inlines, struct inline_node *node,
+		     struct symbol *sym);
+
+/*
+ * dso__free_libdw - Free libdw resources associated with the DSO
+ * @dso: The dso to free resources for
+ *
+ * This function cleans up the Dwfl context used for addr2line lookups.
+ */
+void dso__free_libdw(struct dso *dso);
+
+#else /* HAVE_LIBDW_SUPPORT */
+
+static inline int libdw__addr2line(u64 addr __maybe_unused, char **file __maybe_unused,
+				   unsigned int *line_nr __maybe_unused,
+				   struct dso *dso __maybe_unused,
+				   bool unwind_inlines __maybe_unused,
+				   struct inline_node *node __maybe_unused,
+				   struct symbol *sym __maybe_unused)
+{
+	return 0;
+}
+
+static inline void dso__free_libdw(struct dso *dso __maybe_unused)
+{
+}
+#endif /* HAVE_LIBDW_SUPPORT */
+
+#endif /* PERF_LIBDW_H */
diff --git a/tools/perf/util/llvm.c b/tools/perf/util/llvm.c
index 2ebf1f5f65bf..0d126d233c01 100644
--- a/tools/perf/util/llvm.c
+++ b/tools/perf/util/llvm.c
@@ -118,7 +118,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym,
 {
 #ifdef HAVE_LIBLLVM_SUPPORT
 	struct annotation *notes = symbol__annotation(sym);
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	u64 start = map__rip_2objdump(map, sym->start);
 	/* Malloc-ed buffer containing instructions read from disk. */
@@ -146,7 +146,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym,
 		return errno;
 
 	init_llvm();
-	if (arch__is(args->arch, "x86")) {
+	if (arch__is_x86(args->arch)) {
 		const char *triplet = is_64bit ? "x86_64-pc-linux" : "i686-pc-linux";
 
 		disasm = LLVMCreateDisasm(triplet, &storage, /*tag_type=*/0,
@@ -184,7 +184,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym,
 	args->line = disasm_buf;
 	args->line_nr = 0;
 	args->fileloc = NULL;
-	args->ms.sym = sym;
+	args->ms->sym = sym;
 
 	dl = disasm_line__new(args);
 	if (dl == NULL)
@@ -242,7 +242,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym,
 					 &line_storage_len);
 		args->line_nr = 0;
 		args->fileloc = NULL;
-		args->ms.sym = sym;
+		args->ms->sym = sym;
 
 		llvm_addr2line(filename, pc, &args->fileloc,
 			       (unsigned int *)&args->line_nr, false, NULL);
diff --git a/tools/perf/util/lzma.c b/tools/perf/util/lzma.c
index c355757ed391..91b9b5171d1f 100644
--- a/tools/perf/util/lzma.c
+++ b/tools/perf/util/lzma.c
@@ -59,7 +59,7 @@ int lzma_decompress_stream_to_file(FILE *infile, int output_fd)
 			strm.avail_in = fread(buf_in, 1, sizeof(buf_in), infile);
 
 			if (ferror(infile)) {
-				pr_debug("lzma: read error: %s\n", strerror(errno));
+				pr_debug("lzma: read error: %m\n");
 				goto err_lzma_end;
 			}
 
@@ -73,7 +73,7 @@ int lzma_decompress_stream_to_file(FILE *infile, int output_fd)
 			ssize_t write_size = sizeof(buf_out) - strm.avail_out;
 
 			if (writen(output_fd, buf_out, write_size) != write_size) {
-				pr_debug("lzma: write error: %s\n", strerror(errno));
+				pr_debug("lzma: write error: %m\n");
 				goto err_lzma_end;
 			}
 
@@ -103,7 +103,7 @@ int lzma_decompress_to_file(const char *input, int output_fd)
 
 	infile = fopen(input, "rb");
 	if (!infile) {
-		pr_debug("lzma: fopen failed on %s: '%s'\n", input, strerror(errno));
+		pr_debug("lzma: fopen failed on %s: '%m'\n", input);
 		return -1;
 	}
 
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 841b711d970e..e76f8c86e62a 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2016,7 +2016,7 @@ static void ip__resolve_ams(struct thread *thread,
 	ams->addr = ip;
 	ams->al_addr = al.addr;
 	ams->al_level = al.level;
-	ams->ms.maps = maps__get(al.maps);
+	ams->ms.thread = thread__get(al.thread);
 	ams->ms.sym = al.sym;
 	ams->ms.map = map__get(al.map);
 	ams->phys_addr = 0;
@@ -2037,7 +2037,7 @@ static void ip__resolve_data(struct thread *thread,
 	ams->addr = addr;
 	ams->al_addr = al.addr;
 	ams->al_level = al.level;
-	ams->ms.maps = maps__get(al.maps);
+	ams->ms.thread = thread__get(al.thread);
 	ams->ms.sym = al.sym;
 	ams->ms.map = map__get(al.map);
 	ams->phys_addr = phys_addr;
@@ -2090,6 +2090,59 @@ struct iterations {
 	u64 cycles;
 };
 
+static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms, u64 ip,
+			bool branch, struct branch_flags *flags, int nr_loop_iter,
+			u64 iter_cycles, u64 branch_from)
+{
+	struct symbol *sym = ms->sym;
+	struct map *map = ms->map;
+	struct inline_node *inline_node;
+	struct inline_list *ilist;
+	struct dso *dso;
+	u64 addr;
+	int ret = 1;
+	struct map_symbol ilist_ms;
+	bool first = true;
+
+	if (!symbol_conf.inline_name || !map || !sym)
+		return ret;
+
+	addr = map__dso_map_ip(map, ip);
+	addr = map__rip_2objdump(map, addr);
+	dso = map__dso(map);
+
+	inline_node = inlines__tree_find(dso__inlined_nodes(dso), addr);
+	if (!inline_node) {
+		inline_node = dso__parse_addr_inlines(dso, addr, sym);
+		if (!inline_node)
+			return ret;
+		inlines__tree_insert(dso__inlined_nodes(dso), inline_node);
+	}
+
+	ilist_ms = (struct map_symbol) {
+		.thread = thread__get(ms->thread),
+		.map = map__get(map),
+	};
+	list_for_each_entry(ilist, &inline_node->val, list) {
+		ilist_ms.sym = ilist->symbol;
+		if (first) {
+			ret = callchain_cursor_append(cursor, ip, &ilist_ms,
+						      branch, flags, nr_loop_iter,
+						      iter_cycles, branch_from, ilist->srcline);
+		} else {
+			ret = callchain_cursor_append(cursor, ip, &ilist_ms, false,
+						      NULL, 0, 0, 0, ilist->srcline);
+		}
+		first = false;
+
+		if (ret != 0)
+			return ret;
+	}
+	map_symbol__exit(&ilist_ms);
+
+	return ret;
+}
+
 static int add_callchain_ip(struct thread *thread,
 			    struct callchain_cursor *cursor,
 			    struct symbol **parent,
@@ -2167,9 +2220,14 @@ static int add_callchain_ip(struct thread *thread,
 		iter_cycles = iter->cycles;
 	}
 
-	ms.maps = maps__get(al.maps);
+	ms.thread = thread__get(al.thread);
 	ms.map = map__get(al.map);
 	ms.sym = al.sym;
+
+	if (append_inlines(cursor, &ms, ip, branch, flags, nr_loop_iter,
+			   iter_cycles, branch_from) == 0)
+		goto out;
+
 	srcline = callchain_srcline(&ms, al.addr);
 	err = callchain_cursor_append(cursor, ip, &ms,
 				      branch, flags, nr_loop_iter,
@@ -2325,7 +2383,7 @@ static void save_lbr_cursor_node(struct thread *thread,
 	map_symbol__exit(&lbr_stitch->prev_lbr_cursor[idx].ms);
 	memcpy(&lbr_stitch->prev_lbr_cursor[idx], cursor->curr,
 	       sizeof(struct callchain_cursor_node));
-	lbr_stitch->prev_lbr_cursor[idx].ms.maps = maps__get(cursor->curr->ms.maps);
+	lbr_stitch->prev_lbr_cursor[idx].ms.thread = thread__get(cursor->curr->ms.thread);
 	lbr_stitch->prev_lbr_cursor[idx].ms.map = map__get(cursor->curr->ms.map);
 
 	lbr_stitch->prev_lbr_cursor[idx].valid = true;
@@ -2365,8 +2423,14 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread,
 	}
 
 	if (callee) {
-		/* Add LBR ip from first entries.to */
-		ip = entries[0].to;
+		/*
+		 * Set the (first) leaf function's IP to sample->ip (the
+		 * location of the sample) but if not recorded use entries.to
+		 */
+		if (sample->ip)
+			ip = sample->ip;
+		else
+			ip = entries[0].to;
 		flags = &entries[0].flags;
 		*branch_from = entries[0].from;
 		err = add_callchain_ip(thread, cursor, parent,
@@ -2419,8 +2483,14 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread,
 	}
 
 	if (lbr_nr > 0) {
-		/* Add LBR ip from first entries.to */
-		ip = entries[0].to;
+		/*
+		 * Set the (first) leaf function's IP to sample->ip (the
+		 * location of the sample) but if not recorded use entries.to
+		 */
+		if (sample->ip)
+			ip = sample->ip;
+		else
+			ip = entries[0].to;
 		flags = &entries[0].flags;
 		*branch_from = entries[0].from;
 		err = add_callchain_ip(thread, cursor, parent,
@@ -2538,7 +2608,8 @@ static bool has_stitched_lbr(struct thread *thread,
 		memcpy(&stitch_node->cursor, &lbr_stitch->prev_lbr_cursor[i],
 		       sizeof(struct callchain_cursor_node));
 
-		stitch_node->cursor.ms.maps = maps__get(lbr_stitch->prev_lbr_cursor[i].ms.maps);
+		stitch_node->cursor.ms.thread =
+			thread__get(lbr_stitch->prev_lbr_cursor[i].ms.thread);
 		stitch_node->cursor.ms.map = map__get(lbr_stitch->prev_lbr_cursor[i].ms.map);
 
 		if (callee)
@@ -2888,49 +2959,6 @@ check_calls:
 	return 0;
 }
 
-static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms, u64 ip)
-{
-	struct symbol *sym = ms->sym;
-	struct map *map = ms->map;
-	struct inline_node *inline_node;
-	struct inline_list *ilist;
-	struct dso *dso;
-	u64 addr;
-	int ret = 1;
-	struct map_symbol ilist_ms;
-
-	if (!symbol_conf.inline_name || !map || !sym)
-		return ret;
-
-	addr = map__dso_map_ip(map, ip);
-	addr = map__rip_2objdump(map, addr);
-	dso = map__dso(map);
-
-	inline_node = inlines__tree_find(dso__inlined_nodes(dso), addr);
-	if (!inline_node) {
-		inline_node = dso__parse_addr_inlines(dso, addr, sym);
-		if (!inline_node)
-			return ret;
-		inlines__tree_insert(dso__inlined_nodes(dso), inline_node);
-	}
-
-	ilist_ms = (struct map_symbol) {
-		.maps = maps__get(ms->maps),
-		.map = map__get(map),
-	};
-	list_for_each_entry(ilist, &inline_node->val, list) {
-		ilist_ms.sym = ilist->symbol;
-		ret = callchain_cursor_append(cursor, ip, &ilist_ms, false,
-					      NULL, 0, 0, 0, ilist->srcline);
-
-		if (ret != 0)
-			return ret;
-	}
-	map_symbol__exit(&ilist_ms);
-
-	return ret;
-}
-
 static int unwind_entry(struct unwind_entry *entry, void *arg)
 {
 	struct callchain_cursor *cursor = arg;
@@ -2940,7 +2968,8 @@ static int unwind_entry(struct unwind_entry *entry, void *arg)
 	if (symbol_conf.hide_unresolved && entry->ms.sym == NULL)
 		return 0;
 
-	if (append_inlines(cursor, &entry->ms, entry->ip) == 0)
+	if (append_inlines(cursor, &entry->ms, entry->ip, /*branch=*/false, /*branch_flags=*/NULL,
+			   /*nr_loop_iter=*/0, /*iter_cycles=*/0, /*branch_from=*/0) == 0)
 		return 0;
 
 	/*
diff --git a/tools/perf/util/map_symbol.c b/tools/perf/util/map_symbol.c
index 6ad2960bc289..11bc0a7f704c 100644
--- a/tools/perf/util/map_symbol.c
+++ b/tools/perf/util/map_symbol.c
@@ -2,10 +2,11 @@
 #include "map_symbol.h"
 #include "maps.h"
 #include "map.h"
+#include "thread.h"
 
 void map_symbol__exit(struct map_symbol *ms)
 {
-	maps__zput(ms->maps);
+	thread__zput(ms->thread);
 	map__zput(ms->map);
 }
 
@@ -16,7 +17,7 @@ void addr_map_symbol__exit(struct addr_map_symbol *ams)
 
 void map_symbol__copy(struct map_symbol *dst, struct map_symbol *src)
 {
-	dst->maps = maps__get(src->maps);
+	dst->thread = thread__get(src->thread);
 	dst->map = map__get(src->map);
 	dst->sym = src->sym;
 }
diff --git a/tools/perf/util/map_symbol.h b/tools/perf/util/map_symbol.h
index e370bb32ed47..7437e319f4a3 100644
--- a/tools/perf/util/map_symbol.h
+++ b/tools/perf/util/map_symbol.h
@@ -4,12 +4,13 @@
 
 #include <linux/types.h>
 
+struct thread;
 struct maps;
 struct map;
 struct symbol;
 
 struct map_symbol {
-	struct maps   *maps;
+	struct thread *thread;
 	struct map    *map;
 	struct symbol *sym;
 };
diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c
index c321d4f4d846..4092211cff62 100644
--- a/tools/perf/util/maps.c
+++ b/tools/perf/util/maps.c
@@ -10,6 +10,7 @@
 #include "thread.h"
 #include "ui/ui.h"
 #include "unwind.h"
+#include "unwind-libdw.h"
 #include <internal/rc_check.h>
 
 /*
@@ -40,6 +41,9 @@ DECLARE_RC_STRUCT(maps) {
 	void		*addr_space;
 	const struct unwind_libunwind_ops *unwind_libunwind_ops;
 #endif
+#ifdef HAVE_LIBDW_SUPPORT
+	void		*libdw_addr_space_dwfl;
+#endif
 	refcount_t	 refcnt;
 	/**
 	 * @nr_maps: number of maps_by_address, and possibly maps_by_name,
@@ -203,6 +207,17 @@ void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libun
 	RC_CHK_ACCESS(maps)->unwind_libunwind_ops = ops;
 }
 #endif
+#ifdef HAVE_LIBDW_SUPPORT
+void *maps__libdw_addr_space_dwfl(const struct maps *maps)
+{
+	return RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl;
+}
+
+void maps__set_libdw_addr_space_dwfl(struct maps *maps, void *dwfl)
+{
+	RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl = dwfl;
+}
+#endif
 
 static struct rw_semaphore *maps__lock(struct maps *maps)
 {
@@ -219,6 +234,9 @@ static void maps__init(struct maps *maps, struct machine *machine)
 	RC_CHK_ACCESS(maps)->addr_space = NULL;
 	RC_CHK_ACCESS(maps)->unwind_libunwind_ops = NULL;
 #endif
+#ifdef HAVE_LIBDW_SUPPORT
+	RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl = NULL;
+#endif
 	refcount_set(maps__refcnt(maps), 1);
 	RC_CHK_ACCESS(maps)->nr_maps = 0;
 	RC_CHK_ACCESS(maps)->nr_maps_allocated = 0;
@@ -240,6 +258,9 @@ static void maps__exit(struct maps *maps)
 	zfree(&maps_by_address);
 	zfree(&maps_by_name);
 	unwind__finish_access(maps);
+#ifdef HAVE_LIBDW_SUPPORT
+	libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps));
+#endif
 }
 
 struct maps *maps__new(struct machine *machine)
@@ -549,6 +570,9 @@ void maps__remove(struct maps *maps, struct map *map)
 	__maps__remove(maps, map);
 	check_invariants(maps);
 	up_write(maps__lock(maps));
+#ifdef HAVE_LIBDW_SUPPORT
+	libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps));
+#endif
 }
 
 bool maps__empty(struct maps *maps)
@@ -604,18 +628,26 @@ int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data)
 void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data), void *data)
 {
 	struct map **maps_by_address;
+	bool removed = false;
 
 	down_write(maps__lock(maps));
 
 	maps_by_address = maps__maps_by_address(maps);
 	for (unsigned int i = 0; i < maps__nr_maps(maps);) {
-		if (cb(maps_by_address[i], data))
+		if (cb(maps_by_address[i], data)) {
 			__maps__remove(maps, maps_by_address[i]);
-		else
+			removed = true;
+		} else {
 			i++;
+		}
 	}
 	check_invariants(maps);
 	up_write(maps__lock(maps));
+	if (removed) {
+#ifdef HAVE_LIBDW_SUPPORT
+		libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps));
+#endif
+	}
 }
 
 struct symbol *maps__find_symbol(struct maps *maps, u64 addr, struct map **mapp)
@@ -676,6 +708,7 @@ int maps__find_ams(struct maps *maps, struct addr_map_symbol *ams)
 	if (ams->addr < map__start(ams->ms.map) || ams->addr >= map__end(ams->ms.map)) {
 		if (maps == NULL)
 			return -1;
+		map__put(ams->ms.map);
 		ams->ms.map = maps__find(maps, ams->addr);
 		if (ams->ms.map == NULL)
 			return -1;
diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h
index d9aa62ed968a..20c52084ba9e 100644
--- a/tools/perf/util/maps.h
+++ b/tools/perf/util/maps.h
@@ -52,6 +52,10 @@ void maps__set_addr_space(struct maps *maps, void *addr_space);
 const struct unwind_libunwind_ops *maps__unwind_libunwind_ops(const struct maps *maps);
 void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libunwind_ops *ops);
 #endif
+#ifdef HAVE_LIBDW_SUPPORT
+void *maps__libdw_addr_space_dwfl(const struct maps *maps);
+void maps__set_libdw_addr_space_dwfl(struct maps *maps, void *dwfl);
+#endif
 
 size_t maps__fprintf(struct maps *maps, FILE *fp);
 
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 25c75fdbfc52..46bf4dfeebc8 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -367,7 +367,7 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids,
 static bool match_metric_or_groups(const char *metric_or_groups, const char *sought)
 {
 	int len;
-	char *m;
+	const char *m;
 
 	if (!sought)
 		return false;
@@ -450,11 +450,10 @@ static const char *code_characters = ",-=@";
 
 static int encode_metric_id(struct strbuf *sb, const char *x)
 {
-	char *c;
 	int ret = 0;
 
 	for (; *x; x++) {
-		c = strchr(code_characters, *x);
+		const char *c = strchr(code_characters, *x);
 		if (c) {
 			ret = strbuf_addch(sb, '!');
 			if (ret)
@@ -1563,8 +1562,6 @@ int metricgroup__parse_groups(struct evlist *perf_evlist,
 {
 	const struct pmu_metrics_table *table = pmu_metrics_table__find();
 
-	if (!table)
-		return -EINVAL;
 	if (hardware_aware_grouping)
 		pr_debug("Use hardware aware grouping instead of traditional metric grouping method\n");
 
@@ -1602,22 +1599,16 @@ static int metricgroup__has_metric_or_groups_callback(const struct pmu_metric *p
 
 bool metricgroup__has_metric_or_groups(const char *pmu, const char *metric_or_groups)
 {
-	const struct pmu_metrics_table *tables[2] = {
-		pmu_metrics_table__find(),
-		pmu_metrics_table__default(),
-	};
+	const struct pmu_metrics_table *table = pmu_metrics_table__find();
 	struct metricgroup__has_metric_data data = {
 		.pmu = pmu,
 		.metric_or_groups = metric_or_groups,
 	};
 
-	for (size_t i = 0; i < ARRAY_SIZE(tables); i++) {
-		if (pmu_metrics_table__for_each_metric(tables[i],
-							metricgroup__has_metric_or_groups_callback,
-							&data))
-			return true;
-	}
-	return false;
+	return pmu_metrics_table__for_each_metric(table,
+						  metricgroup__has_metric_or_groups_callback,
+						  &data)
+		? true : false;
 }
 
 static int metricgroup__topdown_max_level_callback(const struct pmu_metric *pm,
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 000c89a1e50d..b9efb296bba5 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -30,7 +30,6 @@
 #include "util/event.h"
 #include "util/bpf-filter.h"
 #include "util/stat.h"
-#include "util/tool_pmu.h"
 #include "util/util.h"
 #include "tracepoint.h"
 #include <api/fs/tracing_path.h>
@@ -230,12 +229,8 @@ __add_event(struct list_head *list, int *idx,
 	if (pmu) {
 		is_pmu_core = pmu->is_core;
 		pmu_cpus = perf_cpu_map__get(pmu->cpus);
-		if (perf_cpu_map__is_empty(pmu_cpus)) {
-			if (perf_pmu__is_tool(pmu))
-				pmu_cpus = tool_pmu__cpus(attr);
-			else
-				pmu_cpus = cpu_map__online();
-		}
+		if (perf_cpu_map__is_empty(pmu_cpus))
+			pmu_cpus = cpu_map__online();
 	} else {
 		is_pmu_core = (attr->type == PERF_TYPE_HARDWARE ||
 			       attr->type == PERF_TYPE_HW_CACHE);
@@ -274,6 +269,7 @@ __add_event(struct list_head *list, int *idx,
 	evsel->core.pmu_cpus = pmu_cpus;
 	evsel->core.requires_cpu = pmu ? pmu->is_uncore : false;
 	evsel->core.is_pmu_core = is_pmu_core;
+	evsel->core.reads_only_on_cpu_idx0 = perf_pmu__reads_only_on_cpu_idx0(attr);
 	evsel->pmu = pmu;
 	evsel->alternate_hw_config = alternate_hw_config;
 	evsel->first_wildcard_match = first_wildcard_match;
@@ -1119,105 +1115,107 @@ static int config_attr(struct perf_event_attr *attr,
 	return 0;
 }
 
-static int get_config_terms(const struct parse_events_terms *head_config,
-			    struct list_head *head_terms)
+static struct evsel_config_term *add_config_term(enum evsel_term_type type,
+						 struct list_head *head_terms,
+						 bool weak)
 {
-#define ADD_CONFIG_TERM(__type, __weak)				\
-	struct evsel_config_term *__t;			\
-								\
-	__t = zalloc(sizeof(*__t));				\
-	if (!__t)						\
-		return -ENOMEM;					\
-								\
-	INIT_LIST_HEAD(&__t->list);				\
-	__t->type       = EVSEL__CONFIG_TERM_ ## __type;	\
-	__t->weak	= __weak;				\
-	list_add_tail(&__t->list, head_terms)
-
-#define ADD_CONFIG_TERM_VAL(__type, __name, __val, __weak)	\
-do {								\
-	ADD_CONFIG_TERM(__type, __weak);			\
-	__t->val.__name = __val;				\
-} while (0)
+	struct evsel_config_term *t;
 
-#define ADD_CONFIG_TERM_STR(__type, __val, __weak)		\
-do {								\
-	ADD_CONFIG_TERM(__type, __weak);			\
-	__t->val.str = strdup(__val);				\
-	if (!__t->val.str) {					\
-		zfree(&__t);					\
-		return -ENOMEM;					\
-	}							\
-	__t->free_str = true;					\
-} while (0)
+	t = zalloc(sizeof(*t));
+	if (!t)
+		return NULL;
 
+	INIT_LIST_HEAD(&t->list);
+	t->type = type;
+	t->weak	= weak;
+	list_add_tail(&t->list, head_terms);
+
+	return t;
+}
+
+static int get_config_terms(const struct parse_events_terms *head_config,
+			    struct list_head *head_terms)
+{
 	struct parse_events_term *term;
 
 	list_for_each_entry(term, &head_config->terms, list) {
+		struct evsel_config_term *new_term;
+		enum evsel_term_type new_type;
+		bool str_type = false;
+		u64 val;
+
 		switch (term->type_term) {
 		case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
-			ADD_CONFIG_TERM_VAL(PERIOD, period, term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_PERIOD;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
-			ADD_CONFIG_TERM_VAL(FREQ, freq, term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_FREQ;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_TIME:
-			ADD_CONFIG_TERM_VAL(TIME, time, term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_TIME;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_CALLGRAPH:
-			ADD_CONFIG_TERM_STR(CALLGRAPH, term->val.str, term->weak);
+			new_type = EVSEL__CONFIG_TERM_CALLGRAPH;
+			str_type = true;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE:
-			ADD_CONFIG_TERM_STR(BRANCH, term->val.str, term->weak);
+			new_type = EVSEL__CONFIG_TERM_BRANCH;
+			str_type = true;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
-			ADD_CONFIG_TERM_VAL(STACK_USER, stack_user,
-					    term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_STACK_USER;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_INHERIT:
-			ADD_CONFIG_TERM_VAL(INHERIT, inherit,
-					    term->val.num ? 1 : 0, term->weak);
+			new_type = EVSEL__CONFIG_TERM_INHERIT;
+			val = term->val.num ? 1 : 0;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
-			ADD_CONFIG_TERM_VAL(INHERIT, inherit,
-					    term->val.num ? 0 : 1, term->weak);
+			new_type = EVSEL__CONFIG_TERM_INHERIT;
+			val = term->val.num ? 0 : 1;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_MAX_STACK:
-			ADD_CONFIG_TERM_VAL(MAX_STACK, max_stack,
-					    term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_MAX_STACK;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS:
-			ADD_CONFIG_TERM_VAL(MAX_EVENTS, max_events,
-					    term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_MAX_EVENTS;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
-			ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite,
-					    term->val.num ? 1 : 0, term->weak);
+			new_type = EVSEL__CONFIG_TERM_OVERWRITE;
+			val = term->val.num ? 1 : 0;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
-			ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite,
-					    term->val.num ? 0 : 1, term->weak);
+			new_type = EVSEL__CONFIG_TERM_OVERWRITE;
+			val = term->val.num ? 0 : 1;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_DRV_CFG:
-			ADD_CONFIG_TERM_STR(DRV_CFG, term->val.str, term->weak);
+			new_type = EVSEL__CONFIG_TERM_DRV_CFG;
+			str_type = true;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_PERCORE:
-			ADD_CONFIG_TERM_VAL(PERCORE, percore,
-					    term->val.num ? true : false, term->weak);
+			new_type = EVSEL__CONFIG_TERM_PERCORE;
+			val = term->val.num ? true : false;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT:
-			ADD_CONFIG_TERM_VAL(AUX_OUTPUT, aux_output,
-					    term->val.num ? 1 : 0, term->weak);
+			new_type = EVSEL__CONFIG_TERM_AUX_OUTPUT;
+			val = term->val.num ? 1 : 0;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_AUX_ACTION:
-			ADD_CONFIG_TERM_STR(AUX_ACTION, term->val.str, term->weak);
+			new_type = EVSEL__CONFIG_TERM_AUX_ACTION;
+			str_type = true;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE:
-			ADD_CONFIG_TERM_VAL(AUX_SAMPLE_SIZE, aux_sample_size,
-					    term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV:
-			ADD_CONFIG_TERM_STR(RATIO_TO_PREV, term->val.str, term->weak);
+			new_type = EVSEL__CONFIG_TERM_RATIO_TO_PREV;
+			str_type = true;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_USER:
 		case PARSE_EVENTS__TERM_TYPE_CONFIG:
@@ -1232,74 +1230,106 @@ do {								\
 		case PARSE_EVENTS__TERM_TYPE_RAW:
 		case PARSE_EVENTS__TERM_TYPE_CPU:
 		default:
-			break;
+			/* Don't add a new term for these ones */
+			continue;
+		}
+
+		new_term = add_config_term(new_type, head_terms, term->weak);
+		if (!new_term)
+			return -ENOMEM;
+
+		if (str_type) {
+			new_term->val.str = strdup(term->val.str);
+			if (!new_term->val.str) {
+				zfree(&new_term);
+				return -ENOMEM;
+			}
+			new_term->free_str = true;
+		} else {
+			new_term->val.val = val;
 		}
 	}
 	return 0;
 }
 
-/*
- * Add EVSEL__CONFIG_TERM_CFG_CHG where cfg_chg will have a bit set for
- * each bit of attr->config that the user has changed.
- */
-static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head_config,
-			   struct list_head *head_terms)
+static int add_cfg_chg(const struct perf_pmu *pmu,
+		       const struct parse_events_terms *head_config,
+		       struct list_head *head_terms,
+		       int format_type,
+		       enum parse_events__term_type term_type,
+		       enum evsel_term_type new_term_type)
 {
 	struct parse_events_term *term;
 	u64 bits = 0;
 	int type;
 
 	list_for_each_entry(term, &head_config->terms, list) {
-		switch (term->type_term) {
-		case PARSE_EVENTS__TERM_TYPE_USER:
+		if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER) {
 			type = perf_pmu__format_type(pmu, term->config);
-			if (type != PERF_PMU_FORMAT_VALUE_CONFIG)
+			if (type != format_type)
 				continue;
 			bits |= perf_pmu__format_bits(pmu, term->config);
-			break;
-		case PARSE_EVENTS__TERM_TYPE_CONFIG:
+		} else if (term->type_term == term_type) {
 			bits = ~(u64)0;
-			break;
-		case PARSE_EVENTS__TERM_TYPE_CONFIG1:
-		case PARSE_EVENTS__TERM_TYPE_CONFIG2:
-		case PARSE_EVENTS__TERM_TYPE_CONFIG3:
-		case PARSE_EVENTS__TERM_TYPE_CONFIG4:
-		case PARSE_EVENTS__TERM_TYPE_LEGACY_HARDWARE_CONFIG:
-		case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE_CONFIG:
-		case PARSE_EVENTS__TERM_TYPE_NAME:
-		case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
-		case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
-		case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE:
-		case PARSE_EVENTS__TERM_TYPE_TIME:
-		case PARSE_EVENTS__TERM_TYPE_CALLGRAPH:
-		case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
-		case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
-		case PARSE_EVENTS__TERM_TYPE_INHERIT:
-		case PARSE_EVENTS__TERM_TYPE_MAX_STACK:
-		case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS:
-		case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
-		case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
-		case PARSE_EVENTS__TERM_TYPE_DRV_CFG:
-		case PARSE_EVENTS__TERM_TYPE_PERCORE:
-		case PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT:
-		case PARSE_EVENTS__TERM_TYPE_AUX_ACTION:
-		case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE:
-		case PARSE_EVENTS__TERM_TYPE_METRIC_ID:
-		case PARSE_EVENTS__TERM_TYPE_RAW:
-		case PARSE_EVENTS__TERM_TYPE_CPU:
-		case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV:
-		default:
-			break;
 		}
 	}
 
-	if (bits)
-		ADD_CONFIG_TERM_VAL(CFG_CHG, cfg_chg, bits, false);
+	if (bits) {
+		struct evsel_config_term *new_term;
+
+		new_term = add_config_term(new_term_type, head_terms, false);
+		if (!new_term)
+			return -ENOMEM;
+		new_term->val.cfg_chg = bits;
+	}
 
-#undef ADD_CONFIG_TERM
 	return 0;
 }
 
+/*
+ * Add EVSEL__CONFIG_TERM_USR_CFG_CONFIGn where cfg_chg will have a bit set for
+ * each bit of attr->configN that the user has changed.
+ */
+static int get_config_chgs(const struct perf_pmu *pmu,
+			   const struct parse_events_terms *head_config,
+			   struct list_head *head_terms)
+{
+	int ret;
+
+	ret = add_cfg_chg(pmu, head_config, head_terms,
+			  PERF_PMU_FORMAT_VALUE_CONFIG,
+			  PARSE_EVENTS__TERM_TYPE_CONFIG,
+			  EVSEL__CONFIG_TERM_USR_CHG_CONFIG);
+	if (ret)
+		return ret;
+
+	ret = add_cfg_chg(pmu, head_config, head_terms,
+			  PERF_PMU_FORMAT_VALUE_CONFIG1,
+			  PARSE_EVENTS__TERM_TYPE_CONFIG1,
+			  EVSEL__CONFIG_TERM_USR_CHG_CONFIG1);
+	if (ret)
+		return ret;
+
+	ret = add_cfg_chg(pmu, head_config, head_terms,
+			  PERF_PMU_FORMAT_VALUE_CONFIG2,
+			  PARSE_EVENTS__TERM_TYPE_CONFIG2,
+			  EVSEL__CONFIG_TERM_USR_CHG_CONFIG2);
+	if (ret)
+		return ret;
+
+	ret = add_cfg_chg(pmu, head_config, head_terms,
+			  PERF_PMU_FORMAT_VALUE_CONFIG3,
+			  PARSE_EVENTS__TERM_TYPE_CONFIG3,
+			  EVSEL__CONFIG_TERM_USR_CHG_CONFIG3);
+	if (ret)
+		return ret;
+
+	return add_cfg_chg(pmu, head_config, head_terms,
+			   PERF_PMU_FORMAT_VALUE_CONFIG4,
+			   PARSE_EVENTS__TERM_TYPE_CONFIG4,
+			   EVSEL__CONFIG_TERM_USR_CHG_CONFIG4);
+}
+
 int parse_events_add_tracepoint(struct parse_events_state *parse_state,
 				struct list_head *list,
 				const char *sys, const char *event,
@@ -1497,12 +1527,8 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
 		return -ENOMEM;
 	}
 
-	/*
-	 * When using default config, record which bits of attr->config were
-	 * changed by the user.
-	 */
-	if (pmu->perf_event_attr_init_default &&
-	    get_config_chgs(pmu, &parsed_terms, &config_terms)) {
+	/* Record which bits of attr->config were changed by the user. */
+	if (get_config_chgs(pmu, &parsed_terms, &config_terms)) {
 		parse_events_terms__exit(&parsed_terms);
 		return -ENOMEM;
 	}
@@ -2220,12 +2246,12 @@ int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filte
 	evlist__splice_list_tail(evlist, &parse_state.list);
 
 	if (ret2 && warn_if_reordered && !parse_state.wild_card_pmus) {
+		evlist__uniquify_evsel_names(evlist, &stat_config);
 		pr_warning("WARNING: events were regrouped to match PMUs\n");
 
 		if (verbose > 0) {
 			struct strbuf sb = STRBUF_INIT;
 
-			evlist__uniquify_evsel_names(evlist, &stat_config);
 			evlist__format_evsels(evlist, &sb, 2048);
 			pr_debug("evlist after sorting/fixing: '%s'\n", sb.buf);
 			strbuf_release(&sb);
diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
index cda1c620968e..c93c2f0c8105 100644
--- a/tools/perf/util/parse-regs-options.c
+++ b/tools/perf/util/parse-regs-options.c
@@ -5,15 +5,54 @@
 #include <string.h>
 #include <stdio.h>
 #include "util/debug.h"
+#include <dwarf-regs.h>
 #include <subcmd/parse-options.h>
 #include "util/perf_regs.h"
 #include "util/parse-regs-options.h"
 
+static void list_perf_regs(FILE *fp, uint64_t mask)
+{
+	const char *last_name = NULL;
+
+	fprintf(fp, "available registers: ");
+	for (int reg = 0; reg < 64; reg++) {
+		const char *name;
+
+		if (((1ULL << reg) & mask) == 0)
+			continue;
+
+		name = perf_reg_name(reg, EM_HOST, EF_HOST);
+		if (name && (!last_name || strcmp(last_name, name)))
+			fprintf(fp, "%s%s", reg > 0 ? " " : "", name);
+		last_name = name;
+	}
+	fputc('\n', fp);
+}
+
+static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
+{
+	uint64_t reg_mask = 0;
+
+	for (int reg = 0; reg < 64; reg++) {
+		const char *name;
+
+		if (((1ULL << reg) & mask) == 0)
+			continue;
+
+		name = perf_reg_name(reg, EM_HOST, EF_HOST);
+		if (!name)
+			continue;
+
+		if (!strcasecmp(to_match, name))
+			reg_mask |= 1ULL << reg;
+	}
+	return reg_mask;
+}
+
 static int
 __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
 {
 	uint64_t *mode = (uint64_t *)opt->value;
-	const struct sample_reg *r = NULL;
 	char *s, *os = NULL, *p;
 	int ret = -1;
 	uint64_t mask;
@@ -27,56 +66,46 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
 	if (*mode)
 		return -1;
 
-	if (intr)
-		mask = arch__intr_reg_mask();
-	else
-		mask = arch__user_reg_mask();
+	mask = intr ? perf_intr_reg_mask(EM_HOST) : perf_user_reg_mask(EM_HOST);
 
 	/* str may be NULL in case no arg is passed to -I */
-	if (str) {
-		/* because str is read-only */
-		s = os = strdup(str);
-		if (!s)
-			return -1;
-
-		for (;;) {
-			p = strchr(s, ',');
-			if (p)
-				*p = '\0';
-
-			if (!strcmp(s, "?")) {
-				fprintf(stderr, "available registers: ");
-				for (r = arch__sample_reg_masks(); r->name; r++) {
-					if (r->mask & mask)
-						fprintf(stderr, "%s ", r->name);
-				}
-				fputc('\n', stderr);
-				/* just printing available regs */
-				goto error;
-			}
-			for (r = arch__sample_reg_masks(); r->name; r++) {
-				if ((r->mask & mask) && !strcasecmp(s, r->name))
-					break;
-			}
-			if (!r || !r->name) {
-				ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
-					    s, intr ? "-I" : "--user-regs=");
-				goto error;
-			}
-
-			*mode |= r->mask;
-
-			if (!p)
-				break;
-
-			s = p + 1;
+	if (!str) {
+		*mode = mask;
+		return 0;
+	}
+
+	/* because str is read-only */
+	s = os = strdup(str);
+	if (!s)
+		return -1;
+
+	for (;;) {
+		uint64_t reg_mask;
+
+		p = strchr(s, ',');
+		if (p)
+			*p = '\0';
+
+		if (!strcmp(s, "?")) {
+			list_perf_regs(stderr, mask);
+			goto error;
+		}
+
+		reg_mask = name_to_perf_reg_mask(s, mask);
+		if (reg_mask == 0) {
+			ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
+				s, intr ? "-I" : "--user-regs=");
+			goto error;
 		}
+		*mode |= reg_mask;
+
+		if (!p)
+			break;
+
+		s = p + 1;
 	}
 	ret = 0;
 
-	/* default to all possible regs */
-	if (*mode == 0)
-		*mode = mask;
 error:
 	free(os);
 	return ret;
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c b/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
index 9dcda80d310f..6833d34dcbfd 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
@@ -1,7 +1,144 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <regex.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include <linux/kernel.h>
+#include <linux/zalloc.h>
 
+#include "../debug.h"
+#include "../event.h"
 #include "../perf_regs.h"
-#include "../../../arch/arm64/include/uapi/asm/perf_regs.h"
+#include "../../perf-sys.h"
+#include "../../arch/arm64/include/perf_regs.h"
+
+#define SMPL_REG_MASK(b) (1ULL << (b))
+
+#ifndef HWCAP_SVE
+#define HWCAP_SVE	(1 << 22)
+#endif
+
+/* %xNUM */
+#define SDT_OP_REGEX1  "^(x[1-2]?[0-9]|3[0-1])$"
+
+/* [sp], [sp, NUM] */
+#define SDT_OP_REGEX2  "^\\[sp(, )?([0-9]+)?\\]$"
+
+static regex_t sdt_op_regex1, sdt_op_regex2;
+
+static int sdt_init_op_regex(void)
+{
+	static int initialized;
+	int ret = 0;
+
+	if (initialized)
+		return 0;
+
+	ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
+	if (ret)
+		goto error;
+
+	ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
+	if (ret)
+		goto free_regex1;
+
+	initialized = 1;
+	return 0;
+
+free_regex1:
+	regfree(&sdt_op_regex1);
+error:
+	pr_debug4("Regex compilation error.\n");
+	return ret;
+}
+
+/*
+ * SDT marker arguments on Arm64 uses %xREG or [sp, NUM], currently
+ * support these two formats.
+ */
+int __perf_sdt_arg_parse_op_arm64(char *old_op, char **new_op)
+{
+	int ret, new_len;
+	regmatch_t rm[5];
+
+	ret = sdt_init_op_regex();
+	if (ret < 0)
+		return ret;
+
+	if (!regexec(&sdt_op_regex1, old_op, 3, rm, 0)) {
+		/* Extract xNUM */
+		new_len = 2;	/* % NULL */
+		new_len += (int)(rm[1].rm_eo - rm[1].rm_so);
+
+		*new_op = zalloc(new_len);
+		if (!*new_op)
+			return -ENOMEM;
+
+		scnprintf(*new_op, new_len, "%%%.*s",
+			(int)(rm[1].rm_eo - rm[1].rm_so), old_op + rm[1].rm_so);
+	} else if (!regexec(&sdt_op_regex2, old_op, 5, rm, 0)) {
+		/* [sp], [sp, NUM] or [sp,NUM] */
+		new_len = 7;	/* + ( % s p ) NULL */
+
+		/* If the argument is [sp], need to fill offset '0' */
+		if (rm[2].rm_so == -1)
+			new_len += 1;
+		else
+			new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
+
+		*new_op = zalloc(new_len);
+		if (!*new_op)
+			return -ENOMEM;
+
+		if (rm[2].rm_so == -1)
+			scnprintf(*new_op, new_len, "+0(%%sp)");
+		else
+			scnprintf(*new_op, new_len, "+%.*s(%%sp)",
+				  (int)(rm[2].rm_eo - rm[2].rm_so),
+				  old_op + rm[2].rm_so);
+	} else {
+		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+		return SDT_ARG_SKIP;
+	}
+
+	return SDT_ARG_VALID;
+}
+
+uint64_t __perf_reg_mask_arm64(bool intr)
+{
+	struct perf_event_attr attr = {
+		.type                   = PERF_TYPE_HARDWARE,
+		.config                 = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_type            = PERF_SAMPLE_REGS_USER,
+		.disabled               = 1,
+		.exclude_kernel         = 1,
+		.sample_period		= 1,
+		.sample_regs_user	= PERF_REGS_MASK
+	};
+	int fd;
+
+	if (intr)
+		return PERF_REGS_MASK;
+
+	if (getauxval(AT_HWCAP) & HWCAP_SVE)
+		attr.sample_regs_user |= SMPL_REG_MASK(PERF_REG_ARM64_VG);
+
+	/*
+	 * Check if the pmu supports perf extended regs, before
+	 * returning the register mask to sample. Open the event
+	 * on the perf process to check this.
+	 */
+	if (attr.sample_regs_user != PERF_REGS_MASK) {
+		event_attr_init(&attr);
+		fd = sys_perf_event_open(&attr, /*pid=*/0, /*cpu=*/-1,
+					 /*group_fd=*/-1, /*flags=*/0);
+		if (fd != -1) {
+			close(fd);
+			return attr.sample_regs_user;
+		}
+	}
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_arm64(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_arm.c b/tools/perf/util/perf-regs-arch/perf_regs_arm.c
index e29d130a587a..184d6e248dfc 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_arm.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_arm.c
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "../perf_regs.h"
-#include "../../../arch/arm/include/uapi/asm/perf_regs.h"
+#include "../../arch/arm/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_arm(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_arm(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_csky.c b/tools/perf/util/perf-regs-arch/perf_regs_csky.c
index 75b461ef2eba..16cbd8303acf 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_csky.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_csky.c
@@ -1,10 +1,26 @@
 // SPDX-License-Identifier: GPL-2.0
-
+#include <elf.h>
+#ifndef EF_CSKY_ABIMASK
+#define EF_CSKY_ABIMASK	0XF0000000
+#endif
+#ifndef EF_CSKY_ABIV2
+#define EF_CSKY_ABIV2	0X20000000
+#endif
 #include "../perf_regs.h"
-#include "../../arch/csky/include/uapi/asm/perf_regs.h"
+#undef __CSKYABIV2__
+#define __CSKYABIV2__ 1  // Always want the V2 register definitions.
+#include "../../arch/csky/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_csky(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
-const char *__perf_reg_name_csky(int id)
+const char *__perf_reg_name_csky(int id, uint32_t e_flags)
 {
+	if (id >= PERF_REG_CSKY_EXREGS0 && (e_flags & EF_CSKY_ABIMASK) == EF_CSKY_ABIV2)
+		return NULL;
+
 	switch (id) {
 	case PERF_REG_CSKY_A0:
 		return "a0";
@@ -40,7 +56,6 @@ const char *__perf_reg_name_csky(int id)
 		return "lr";
 	case PERF_REG_CSKY_PC:
 		return "pc";
-#if defined(__CSKYABIV2__)
 	case PERF_REG_CSKY_EXREGS0:
 		return "exregs0";
 	case PERF_REG_CSKY_EXREGS1:
@@ -77,12 +92,9 @@ const char *__perf_reg_name_csky(int id)
 		return "hi";
 	case PERF_REG_CSKY_LO:
 		return "lo";
-#endif
 	default:
 		return NULL;
 	}
-
-	return NULL;
 }
 
 uint64_t __perf_reg_ip_csky(void)
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c b/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
index 043f97f4e3ac..478ee889afa1 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "../perf_regs.h"
-#include "../../../arch/loongarch/include/uapi/asm/perf_regs.h"
+#include "../../arch/loongarch/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_loongarch(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_loongarch(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_mips.c b/tools/perf/util/perf-regs-arch/perf_regs_mips.c
index 793178fc3c78..c5a475f6ec64 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_mips.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_mips.c
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "../perf_regs.h"
-#include "../../../arch/mips/include/uapi/asm/perf_regs.h"
+#include "../../arch/mips/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_mips(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_mips(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c b/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
index 08636bb09a3a..217a001ccd2e 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
@@ -1,7 +1,188 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <errno.h>
+#include <string.h>
+#include <regex.h>
+#include <linux/zalloc.h>
+
+#include "../debug.h"
+#include "../event.h"
+#include "../header.h"
 #include "../perf_regs.h"
-#include "../../../arch/powerpc/include/uapi/asm/perf_regs.h"
+#include "../../perf-sys.h"
+#include "../../arch/powerpc/util/utils_header.h"
+#include "../../arch/powerpc/include/perf_regs.h"
+
+#include <linux/kernel.h>
+
+#define PVR_POWER9		0x004E
+#define PVR_POWER10		0x0080
+#define PVR_POWER11		0x0082
+
+/* REG or %rREG */
+#define SDT_OP_REGEX1  "^(%r)?([1-2]?[0-9]|3[0-1])$"
+
+/* -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) */
+#define SDT_OP_REGEX2  "^(\\-)?([0-9]+)\\((%r)?([1-2]?[0-9]|3[0-1])\\)$"
+
+static regex_t sdt_op_regex1, sdt_op_regex2;
+
+static int sdt_init_op_regex(void)
+{
+	static int initialized;
+	int ret = 0;
+
+	if (initialized)
+		return 0;
+
+	ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
+	if (ret)
+		goto error;
+
+	ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
+	if (ret)
+		goto free_regex1;
+
+	initialized = 1;
+	return 0;
+
+free_regex1:
+	regfree(&sdt_op_regex1);
+error:
+	pr_debug4("Regex compilation error.\n");
+	return ret;
+}
+
+/*
+ * Parse OP and convert it into uprobe format, which is, +/-NUM(%gprREG).
+ * Possible variants of OP are:
+ *	Format		Example
+ *	-------------------------
+ *	NUM(REG)	48(18)
+ *	-NUM(REG)	-48(18)
+ *	NUM(%rREG)	48(%r18)
+ *	-NUM(%rREG)	-48(%r18)
+ *	REG		18
+ *	%rREG		%r18
+ *	iNUM		i0
+ *	i-NUM		i-1
+ *
+ * SDT marker arguments on Powerpc uses %rREG form with -mregnames flag
+ * and REG form with -mno-regnames. Here REG is general purpose register,
+ * which is in 0 to 31 range.
+ */
+int __perf_sdt_arg_parse_op_powerpc(char *old_op, char **new_op)
+{
+	int ret, new_len;
+	regmatch_t rm[5];
+	char prefix;
+
+	/* Constant argument. Uprobe does not support it */
+	if (old_op[0] == 'i') {
+		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+		return SDT_ARG_SKIP;
+	}
+
+	ret = sdt_init_op_regex();
+	if (ret < 0)
+		return ret;
+
+	if (!regexec(&sdt_op_regex1, old_op, 3, rm, 0)) {
+		/* REG or %rREG --> %gprREG */
+
+		new_len = 5;	/* % g p r NULL */
+		new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
+
+		*new_op = zalloc(new_len);
+		if (!*new_op)
+			return -ENOMEM;
+
+		scnprintf(*new_op, new_len, "%%gpr%.*s",
+			(int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so);
+	} else if (!regexec(&sdt_op_regex2, old_op, 5, rm, 0)) {
+		/*
+		 * -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) -->
+		 *	+/-NUM(%gprREG)
+		 */
+		prefix = (rm[1].rm_so == -1) ? '+' : '-';
+
+		new_len = 8;	/* +/- ( % g p r ) NULL */
+		new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
+		new_len += (int)(rm[4].rm_eo - rm[4].rm_so);
+
+		*new_op = zalloc(new_len);
+		if (!*new_op)
+			return -ENOMEM;
+
+		scnprintf(*new_op, new_len, "%c%.*s(%%gpr%.*s)", prefix,
+			(int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
+			(int)(rm[4].rm_eo - rm[4].rm_so), old_op + rm[4].rm_so);
+	} else {
+		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+		return SDT_ARG_SKIP;
+	}
+
+	return SDT_ARG_VALID;
+}
+
+/*
+ * mfspr is a POWERPC specific instruction, ensure it's only
+ * built and called on POWERPC by guarding with __powerpc64__
+ * or __powerpc__.
+ */
+#if defined(__powerpc64__) && defined(__powerpc__)
+uint64_t __perf_reg_mask_powerpc(bool intr)
+{
+	struct perf_event_attr attr = {
+		.type                   = PERF_TYPE_HARDWARE,
+		.config                 = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_type            = PERF_SAMPLE_REGS_INTR,
+		.precise_ip             = 1,
+		.disabled               = 1,
+		.exclude_kernel         = 1,
+	};
+	int fd;
+	u32 version;
+	u64 extended_mask = 0, mask = PERF_REGS_MASK;
+
+	if (!intr)
+		return PERF_REGS_MASK;
+
+	/*
+	 * Get the PVR value to set the extended
+	 * mask specific to platform.
+	 */
+	version = (((mfspr(SPRN_PVR)) >>  16) & 0xFFFF);
+	if (version == PVR_POWER9)
+		extended_mask = PERF_REG_PMU_MASK_300;
+	else if ((version == PVR_POWER10) || (version == PVR_POWER11))
+		extended_mask = PERF_REG_PMU_MASK_31;
+	else
+		return mask;
+
+	attr.sample_regs_intr = extended_mask;
+	attr.sample_period = 1;
+	event_attr_init(&attr);
+
+	/*
+	 * Check if the pmu supports perf extended regs, before
+	 * returning the register mask to sample. Open the event
+	 * on the perf process to check this.
+	 */
+	fd = sys_perf_event_open(&attr, /*pid=*/0, /*cpu=*/-1,
+				 /*group_fd=*/-1, /*flags=*/0);
+	if (fd != -1) {
+		close(fd);
+		mask |= extended_mask;
+	}
+	return mask;
+}
+#else
+uint64_t __perf_reg_mask_powerpc(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
+#endif
 
 const char *__perf_reg_name_powerpc(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_riscv.c b/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
index 337b687c655d..5b5f21fcba8c 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "../perf_regs.h"
-#include "../../../arch/riscv/include/uapi/asm/perf_regs.h"
+#include "../../arch/riscv/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_riscv(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_riscv(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_s390.c b/tools/perf/util/perf-regs-arch/perf_regs_s390.c
index d69bba881080..c61df24edf0f 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_s390.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_s390.c
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "../perf_regs.h"
-#include "../../../arch/s390/include/uapi/asm/perf_regs.h"
+#include "../../arch/s390/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_s390(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_s390(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
index 708954a9d35d..b6d20522b4e8 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
@@ -1,7 +1,286 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <errno.h>
+#include <string.h>
+#include <regex.h>
+#include <linux/kernel.h>
+#include <linux/zalloc.h>
+
+#include "../debug.h"
+#include "../event.h"
+#include "../pmu.h"
+#include "../pmus.h"
 #include "../perf_regs.h"
-#include "../../../arch/x86/include/uapi/asm/perf_regs.h"
+#include "../../perf-sys.h"
+#include "../../arch/x86/include/perf_regs.h"
+
+struct sdt_name_reg {
+	const char *sdt_name;
+	const char *uprobe_name;
+};
+#define SDT_NAME_REG(n, m) {.sdt_name = "%" #n, .uprobe_name = "%" #m}
+#define SDT_NAME_REG_END {.sdt_name = NULL, .uprobe_name = NULL}
+
+static const struct sdt_name_reg sdt_reg_tbl[] = {
+	SDT_NAME_REG(eax, ax),
+	SDT_NAME_REG(rax, ax),
+	SDT_NAME_REG(al,  ax),
+	SDT_NAME_REG(ah,  ax),
+	SDT_NAME_REG(ebx, bx),
+	SDT_NAME_REG(rbx, bx),
+	SDT_NAME_REG(bl,  bx),
+	SDT_NAME_REG(bh,  bx),
+	SDT_NAME_REG(ecx, cx),
+	SDT_NAME_REG(rcx, cx),
+	SDT_NAME_REG(cl,  cx),
+	SDT_NAME_REG(ch,  cx),
+	SDT_NAME_REG(edx, dx),
+	SDT_NAME_REG(rdx, dx),
+	SDT_NAME_REG(dl,  dx),
+	SDT_NAME_REG(dh,  dx),
+	SDT_NAME_REG(esi, si),
+	SDT_NAME_REG(rsi, si),
+	SDT_NAME_REG(sil, si),
+	SDT_NAME_REG(edi, di),
+	SDT_NAME_REG(rdi, di),
+	SDT_NAME_REG(dil, di),
+	SDT_NAME_REG(ebp, bp),
+	SDT_NAME_REG(rbp, bp),
+	SDT_NAME_REG(bpl, bp),
+	SDT_NAME_REG(rsp, sp),
+	SDT_NAME_REG(esp, sp),
+	SDT_NAME_REG(spl, sp),
+
+	/* rNN registers */
+	SDT_NAME_REG(r8b,  r8),
+	SDT_NAME_REG(r8w,  r8),
+	SDT_NAME_REG(r8d,  r8),
+	SDT_NAME_REG(r9b,  r9),
+	SDT_NAME_REG(r9w,  r9),
+	SDT_NAME_REG(r9d,  r9),
+	SDT_NAME_REG(r10b, r10),
+	SDT_NAME_REG(r10w, r10),
+	SDT_NAME_REG(r10d, r10),
+	SDT_NAME_REG(r11b, r11),
+	SDT_NAME_REG(r11w, r11),
+	SDT_NAME_REG(r11d, r11),
+	SDT_NAME_REG(r12b, r12),
+	SDT_NAME_REG(r12w, r12),
+	SDT_NAME_REG(r12d, r12),
+	SDT_NAME_REG(r13b, r13),
+	SDT_NAME_REG(r13w, r13),
+	SDT_NAME_REG(r13d, r13),
+	SDT_NAME_REG(r14b, r14),
+	SDT_NAME_REG(r14w, r14),
+	SDT_NAME_REG(r14d, r14),
+	SDT_NAME_REG(r15b, r15),
+	SDT_NAME_REG(r15w, r15),
+	SDT_NAME_REG(r15d, r15),
+	SDT_NAME_REG_END,
+};
+
+/*
+ * Perf only supports OP which is in  +/-NUM(REG)  form.
+ * Here plus-minus sign, NUM and parenthesis are optional,
+ * only REG is mandatory.
+ *
+ * SDT events also supports indirect addressing mode with a
+ * symbol as offset, scaled mode and constants in OP. But
+ * perf does not support them yet. Below are few examples.
+ *
+ * OP with scaled mode:
+ *     (%rax,%rsi,8)
+ *     10(%ras,%rsi,8)
+ *
+ * OP with indirect addressing mode:
+ *     check_action(%rip)
+ *     mp_+52(%rip)
+ *     44+mp_(%rip)
+ *
+ * OP with constant values:
+ *     $0
+ *     $123
+ *     $-1
+ */
+#define SDT_OP_REGEX  "^([+\\-]?)([0-9]*)(\\(?)(%[a-z][a-z0-9]+)(\\)?)$"
+
+static regex_t sdt_op_regex;
+
+static int sdt_init_op_regex(void)
+{
+	static int initialized;
+	int ret = 0;
+
+	if (initialized)
+		return 0;
+
+	ret = regcomp(&sdt_op_regex, SDT_OP_REGEX, REG_EXTENDED);
+	if (ret < 0) {
+		pr_debug4("Regex compilation error.\n");
+		return ret;
+	}
+
+	initialized = 1;
+	return 0;
+}
+
+/*
+ * Max x86 register name length is 5(ex: %r15d). So, 6th char
+ * should always contain NULL. This helps to find register name
+ * length using strlen, instead of maintaining one more variable.
+ */
+#define SDT_REG_NAME_SIZE  6
+
+/*
+ * The uprobe parser does not support all gas register names;
+ * so, we have to replace them (ex. for x86_64: %rax -> %ax).
+ * Note: If register does not require renaming, just copy
+ * paste as it is, but don't leave it empty.
+ */
+static void sdt_rename_register(char *sdt_reg, int sdt_len, char *uprobe_reg)
+{
+	int i = 0;
+
+	for (i = 0; sdt_reg_tbl[i].sdt_name != NULL; i++) {
+		if (!strncmp(sdt_reg_tbl[i].sdt_name, sdt_reg, sdt_len)) {
+			strcpy(uprobe_reg, sdt_reg_tbl[i].uprobe_name);
+			return;
+		}
+	}
+
+	strncpy(uprobe_reg, sdt_reg, sdt_len);
+}
+
+int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op)
+{
+	char new_reg[SDT_REG_NAME_SIZE] = {0};
+	int new_len = 0, ret;
+	/*
+	 * rm[0]:  +/-NUM(REG)
+	 * rm[1]:  +/-
+	 * rm[2]:  NUM
+	 * rm[3]:  (
+	 * rm[4]:  REG
+	 * rm[5]:  )
+	 */
+	regmatch_t rm[6];
+	/*
+	 * Max prefix length is 2 as it may contains sign(+/-)
+	 * and displacement 0 (Both sign and displacement 0 are
+	 * optional so it may be empty). Use one more character
+	 * to hold last NULL so that strlen can be used to find
+	 * prefix length, instead of maintaining one more variable.
+	 */
+	char prefix[3] = {0};
+
+	ret = sdt_init_op_regex();
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * If unsupported OR does not match with regex OR
+	 * register name too long, skip it.
+	 */
+	if (strchr(old_op, ',') || strchr(old_op, '$') ||
+	    regexec(&sdt_op_regex, old_op, 6, rm, 0)   ||
+	    rm[4].rm_eo - rm[4].rm_so > SDT_REG_NAME_SIZE) {
+		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+		return SDT_ARG_SKIP;
+	}
+
+	/*
+	 * Prepare prefix.
+	 * If SDT OP has parenthesis but does not provide
+	 * displacement, add 0 for displacement.
+	 *     SDT         Uprobe     Prefix
+	 *     -----------------------------
+	 *     +24(%rdi)   +24(%di)   +
+	 *     24(%rdi)    +24(%di)   +
+	 *     %rdi        %di
+	 *     (%rdi)      +0(%di)    +0
+	 *     -80(%rbx)   -80(%bx)   -
+	 */
+	if (rm[3].rm_so != rm[3].rm_eo) {
+		if (rm[1].rm_so != rm[1].rm_eo)
+			prefix[0] = *(old_op + rm[1].rm_so);
+		else if (rm[2].rm_so != rm[2].rm_eo)
+			prefix[0] = '+';
+		else
+			scnprintf(prefix, sizeof(prefix), "+0");
+	}
+
+	/* Rename register */
+	sdt_rename_register(old_op + rm[4].rm_so, rm[4].rm_eo - rm[4].rm_so,
+			    new_reg);
+
+	/* Prepare final OP which should be valid for uprobe_events */
+	new_len = strlen(prefix)              +
+		  (rm[2].rm_eo - rm[2].rm_so) +
+		  (rm[3].rm_eo - rm[3].rm_so) +
+		  strlen(new_reg)             +
+		  (rm[5].rm_eo - rm[5].rm_so) +
+		  1;					/* NULL */
+
+	*new_op = zalloc(new_len);
+	if (!*new_op)
+		return -ENOMEM;
+
+	scnprintf(*new_op, new_len, "%.*s%.*s%.*s%.*s%.*s",
+		  strlen(prefix), prefix,
+		  (int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
+		  (int)(rm[3].rm_eo - rm[3].rm_so), old_op + rm[3].rm_so,
+		  strlen(new_reg), new_reg,
+		  (int)(rm[5].rm_eo - rm[5].rm_so), old_op + rm[5].rm_so);
+
+	return SDT_ARG_VALID;
+}
+
+uint64_t __perf_reg_mask_x86(bool intr)
+{
+	struct perf_event_attr attr = {
+		.type			= PERF_TYPE_HARDWARE,
+		.config			= PERF_COUNT_HW_CPU_CYCLES,
+		.sample_type		= PERF_SAMPLE_REGS_INTR,
+		.sample_regs_intr	= PERF_REG_EXTENDED_MASK,
+		.precise_ip		= 1,
+		.disabled		= 1,
+		.exclude_kernel		= 1,
+	};
+	int fd;
+
+	if (!intr)
+		return PERF_REGS_MASK;
+
+	/*
+	 * In an unnamed union, init it here to build on older gcc versions
+	 */
+	attr.sample_period = 1;
+
+	if (perf_pmus__num_core_pmus() > 1) {
+		struct perf_pmu *pmu = NULL;
+		__u64 type = PERF_TYPE_RAW;
+
+		/*
+		 * The same register set is supported among different hybrid PMUs.
+		 * Only check the first available one.
+		 */
+		while ((pmu = perf_pmus__scan_core(pmu)) != NULL) {
+			type = pmu->type;
+			break;
+		}
+		attr.config |= type << PERF_PMU_TYPE_SHIFT;
+	}
+
+	event_attr_init(&attr);
+	fd = sys_perf_event_open(&attr, /*pid=*/0, /*cpu=*/-1,
+				 /*group_fd=*/-1, /*flags=*/0);
+	if (fd != -1) {
+		close(fd);
+		return (PERF_REG_EXTENDED_MASK | PERF_REGS_MASK);
+	}
+
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_x86(int id)
 {
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 44b90bbf2d07..5b8f34beb24e 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -1,59 +1,165 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <elf.h>
 #include <errno.h>
 #include <string.h>
+#include "dwarf-regs.h"
 #include "perf_regs.h"
 #include "util/sample.h"
 #include "debug.h"
 
-int __weak arch_sdt_arg_parse_op(char *old_op __maybe_unused,
-				 char **new_op __maybe_unused)
+int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op)
 {
-	return SDT_ARG_SKIP;
-}
+	int ret = SDT_ARG_SKIP;
+
+	switch (e_machine) {
+	case EM_AARCH64:
+		ret = __perf_sdt_arg_parse_op_arm64(old_op, new_op);
+		break;
+	case EM_PPC:
+	case EM_PPC64:
+		ret = __perf_sdt_arg_parse_op_powerpc(old_op, new_op);
+		break;
+	case EM_386:
+	case EM_X86_64:
+		ret = __perf_sdt_arg_parse_op_x86(old_op, new_op);
+		break;
+	default:
+		pr_debug("Unknown ELF machine %d, standard arguments parse will be skipped.\n",
+			 e_machine);
+		break;
+	}
 
-uint64_t __weak arch__intr_reg_mask(void)
-{
-	return 0;
+	return ret;
 }
 
-uint64_t __weak arch__user_reg_mask(void)
+uint64_t perf_intr_reg_mask(uint16_t e_machine)
 {
-	return 0;
-}
+	uint64_t mask = 0;
+
+	switch (e_machine) {
+	case EM_ARM:
+		mask = __perf_reg_mask_arm(/*intr=*/true);
+		break;
+	case EM_AARCH64:
+		mask = __perf_reg_mask_arm64(/*intr=*/true);
+		break;
+	case EM_CSKY:
+		mask = __perf_reg_mask_csky(/*intr=*/true);
+		break;
+	case EM_LOONGARCH:
+		mask = __perf_reg_mask_loongarch(/*intr=*/true);
+		break;
+	case EM_MIPS:
+		mask = __perf_reg_mask_mips(/*intr=*/true);
+		break;
+	case EM_PPC:
+	case EM_PPC64:
+		mask = __perf_reg_mask_powerpc(/*intr=*/true);
+		break;
+	case EM_RISCV:
+		mask = __perf_reg_mask_riscv(/*intr=*/true);
+		break;
+	case EM_S390:
+		mask = __perf_reg_mask_s390(/*intr=*/true);
+		break;
+	case EM_386:
+	case EM_X86_64:
+		mask = __perf_reg_mask_x86(/*intr=*/true);
+		break;
+	default:
+		pr_debug("Unknown ELF machine %d, interrupt sampling register mask will be empty.\n",
+			 e_machine);
+		break;
+	}
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
+	return mask;
+}
 
-const struct sample_reg * __weak arch__sample_reg_masks(void)
+uint64_t perf_user_reg_mask(uint16_t e_machine)
 {
-	return sample_reg_masks;
+	uint64_t mask = 0;
+
+	switch (e_machine) {
+	case EM_ARM:
+		mask = __perf_reg_mask_arm(/*intr=*/false);
+		break;
+	case EM_AARCH64:
+		mask = __perf_reg_mask_arm64(/*intr=*/false);
+		break;
+	case EM_CSKY:
+		mask = __perf_reg_mask_csky(/*intr=*/false);
+		break;
+	case EM_LOONGARCH:
+		mask = __perf_reg_mask_loongarch(/*intr=*/false);
+		break;
+	case EM_MIPS:
+		mask = __perf_reg_mask_mips(/*intr=*/false);
+		break;
+	case EM_PPC:
+	case EM_PPC64:
+		mask = __perf_reg_mask_powerpc(/*intr=*/false);
+		break;
+	case EM_RISCV:
+		mask = __perf_reg_mask_riscv(/*intr=*/false);
+		break;
+	case EM_S390:
+		mask = __perf_reg_mask_s390(/*intr=*/false);
+		break;
+	case EM_386:
+	case EM_X86_64:
+		mask = __perf_reg_mask_x86(/*intr=*/false);
+		break;
+	default:
+		pr_debug("Unknown ELF machine %d, user sampling register mask will be empty.\n",
+			 e_machine);
+		break;
+	}
+
+	return mask;
 }
 
-const char *perf_reg_name(int id, const char *arch)
+const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
 {
 	const char *reg_name = NULL;
 
-	if (!strcmp(arch, "csky"))
-		reg_name = __perf_reg_name_csky(id);
-	else if (!strcmp(arch, "loongarch"))
+	switch (e_machine) {
+	case EM_ARM:
+		reg_name = __perf_reg_name_arm(id);
+		break;
+	case EM_AARCH64:
+		reg_name = __perf_reg_name_arm64(id);
+		break;
+	case EM_CSKY:
+		reg_name = __perf_reg_name_csky(id, e_flags);
+		break;
+	case EM_LOONGARCH:
 		reg_name = __perf_reg_name_loongarch(id);
-	else if (!strcmp(arch, "mips"))
+		break;
+	case EM_MIPS:
 		reg_name = __perf_reg_name_mips(id);
-	else if (!strcmp(arch, "powerpc"))
+		break;
+	case EM_PPC:
+	case EM_PPC64:
 		reg_name = __perf_reg_name_powerpc(id);
-	else if (!strcmp(arch, "riscv"))
+		break;
+	case EM_RISCV:
 		reg_name = __perf_reg_name_riscv(id);
-	else if (!strcmp(arch, "s390"))
+		break;
+	case EM_S390:
 		reg_name = __perf_reg_name_s390(id);
-	else if (!strcmp(arch, "x86"))
+		break;
+	case EM_386:
+	case EM_X86_64:
 		reg_name = __perf_reg_name_x86(id);
-	else if (!strcmp(arch, "arm"))
-		reg_name = __perf_reg_name_arm(id);
-	else if (!strcmp(arch, "arm64"))
-		reg_name = __perf_reg_name_arm64(id);
+		break;
+	default:
+		break;
+	}
+	if (reg_name)
+		return reg_name;
 
-	return reg_name ?: "unknown";
+	pr_debug("Failed to find register %d for ELF machine type %u\n", id, e_machine);
+	return "unknown";
 }
 
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id)
@@ -83,52 +189,60 @@ out:
 	return 0;
 }
 
-uint64_t perf_arch_reg_ip(const char *arch)
+uint64_t perf_arch_reg_ip(uint16_t e_machine)
 {
-	if (!strcmp(arch, "arm"))
+	switch (e_machine) {
+	case EM_ARM:
 		return __perf_reg_ip_arm();
-	else if (!strcmp(arch, "arm64"))
+	case EM_AARCH64:
 		return __perf_reg_ip_arm64();
-	else if (!strcmp(arch, "csky"))
+	case EM_CSKY:
 		return __perf_reg_ip_csky();
-	else if (!strcmp(arch, "loongarch"))
+	case EM_LOONGARCH:
 		return __perf_reg_ip_loongarch();
-	else if (!strcmp(arch, "mips"))
+	case EM_MIPS:
 		return __perf_reg_ip_mips();
-	else if (!strcmp(arch, "powerpc"))
+	case EM_PPC:
+	case EM_PPC64:
 		return __perf_reg_ip_powerpc();
-	else if (!strcmp(arch, "riscv"))
+	case EM_RISCV:
 		return __perf_reg_ip_riscv();
-	else if (!strcmp(arch, "s390"))
+	case EM_S390:
 		return __perf_reg_ip_s390();
-	else if (!strcmp(arch, "x86"))
+	case EM_386:
+	case EM_X86_64:
 		return __perf_reg_ip_x86();
-
-	pr_err("Fail to find IP register for arch %s, returns 0\n", arch);
-	return 0;
+	default:
+		pr_err("Failed to find IP register for ELF machine type %u\n", e_machine);
+		return 0;
+	}
 }
 
-uint64_t perf_arch_reg_sp(const char *arch)
+uint64_t perf_arch_reg_sp(uint16_t e_machine)
 {
-	if (!strcmp(arch, "arm"))
+	switch (e_machine) {
+	case EM_ARM:
 		return __perf_reg_sp_arm();
-	else if (!strcmp(arch, "arm64"))
+	case EM_AARCH64:
 		return __perf_reg_sp_arm64();
-	else if (!strcmp(arch, "csky"))
+	case EM_CSKY:
 		return __perf_reg_sp_csky();
-	else if (!strcmp(arch, "loongarch"))
+	case EM_LOONGARCH:
 		return __perf_reg_sp_loongarch();
-	else if (!strcmp(arch, "mips"))
+	case EM_MIPS:
 		return __perf_reg_sp_mips();
-	else if (!strcmp(arch, "powerpc"))
+	case EM_PPC:
+	case EM_PPC64:
 		return __perf_reg_sp_powerpc();
-	else if (!strcmp(arch, "riscv"))
+	case EM_RISCV:
 		return __perf_reg_sp_riscv();
-	else if (!strcmp(arch, "s390"))
+	case EM_S390:
 		return __perf_reg_sp_s390();
-	else if (!strcmp(arch, "x86"))
+	case EM_386:
+	case EM_X86_64:
 		return __perf_reg_sp_x86();
-
-	pr_err("Fail to find SP register for arch %s, returns 0\n", arch);
-	return 0;
+	default:
+		pr_err("Failed to find SP register for ELF machine type %u\n", e_machine);
+		return 0;
+	}
 }
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index f2d0736d65cc..7c04700bf837 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -7,62 +7,71 @@
 
 struct regs_dump;
 
-struct sample_reg {
-	const char *name;
-	uint64_t mask;
-};
-
-#define SMPL_REG_MASK(b) (1ULL << (b))
-#define SMPL_REG(n, b) { .name = #n, .mask = SMPL_REG_MASK(b) }
-#define SMPL_REG2_MASK(b) (3ULL << (b))
-#define SMPL_REG2(n, b) { .name = #n, .mask = SMPL_REG2_MASK(b) }
-#define SMPL_REG_END { .name = NULL }
-
 enum {
 	SDT_ARG_VALID = 0,
 	SDT_ARG_SKIP,
 };
 
-int arch_sdt_arg_parse_op(char *old_op, char **new_op);
-uint64_t arch__intr_reg_mask(void);
-uint64_t arch__user_reg_mask(void);
-const struct sample_reg *arch__sample_reg_masks(void);
+int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op);
+uint64_t perf_intr_reg_mask(uint16_t e_machine);
+uint64_t perf_user_reg_mask(uint16_t e_machine);
 
-const char *perf_reg_name(int id, const char *arch);
+const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags);
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
-uint64_t perf_arch_reg_ip(const char *arch);
-uint64_t perf_arch_reg_sp(const char *arch);
+uint64_t perf_arch_reg_ip(uint16_t e_machine);
+uint64_t perf_arch_reg_sp(uint16_t e_machine);
+
+int __perf_sdt_arg_parse_op_arm64(char *old_op, char **new_op);
+uint64_t __perf_reg_mask_arm64(bool intr);
 const char *__perf_reg_name_arm64(int id);
 uint64_t __perf_reg_ip_arm64(void);
 uint64_t __perf_reg_sp_arm64(void);
+
+uint64_t __perf_reg_mask_arm(bool intr);
 const char *__perf_reg_name_arm(int id);
 uint64_t __perf_reg_ip_arm(void);
 uint64_t __perf_reg_sp_arm(void);
-const char *__perf_reg_name_csky(int id);
+
+uint64_t __perf_reg_mask_csky(bool intr);
+const char *__perf_reg_name_csky(int id, uint32_t e_flags);
 uint64_t __perf_reg_ip_csky(void);
 uint64_t __perf_reg_sp_csky(void);
+
+uint64_t __perf_reg_mask_loongarch(bool intr);
 const char *__perf_reg_name_loongarch(int id);
 uint64_t __perf_reg_ip_loongarch(void);
 uint64_t __perf_reg_sp_loongarch(void);
+
+uint64_t __perf_reg_mask_mips(bool intr);
 const char *__perf_reg_name_mips(int id);
 uint64_t __perf_reg_ip_mips(void);
 uint64_t __perf_reg_sp_mips(void);
+
+int __perf_sdt_arg_parse_op_powerpc(char *old_op, char **new_op);
+uint64_t __perf_reg_mask_powerpc(bool intr);
 const char *__perf_reg_name_powerpc(int id);
 uint64_t __perf_reg_ip_powerpc(void);
 uint64_t __perf_reg_sp_powerpc(void);
+
+uint64_t __perf_reg_mask_riscv(bool intr);
 const char *__perf_reg_name_riscv(int id);
 uint64_t __perf_reg_ip_riscv(void);
 uint64_t __perf_reg_sp_riscv(void);
+
+uint64_t __perf_reg_mask_s390(bool intr);
 const char *__perf_reg_name_s390(int id);
 uint64_t __perf_reg_ip_s390(void);
 uint64_t __perf_reg_sp_s390(void);
+
+int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op);
+uint64_t __perf_reg_mask_x86(bool intr);
 const char *__perf_reg_name_x86(int id);
 uint64_t __perf_reg_ip_x86(void);
 uint64_t __perf_reg_sp_x86(void);
 
-static inline uint64_t DWARF_MINIMAL_REGS(const char *arch)
+static inline uint64_t DWARF_MINIMAL_REGS(uint16_t e_machine)
 {
-	return (1ULL << perf_arch_reg_ip(arch)) | (1ULL << perf_arch_reg_sp(arch));
+	return (1ULL << perf_arch_reg_ip(e_machine)) | (1ULL << perf_arch_reg_sp(e_machine));
 }
 
 #endif /* __PERF_REGS_H */
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 01a21b6aa031..23337d2fa281 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -118,31 +118,6 @@ struct perf_pmu_alias {
 	bool info_loaded;
 };
 
-/**
- * struct perf_pmu_format - Values from a format file read from
- * <sysfs>/devices/cpu/format/ held in struct perf_pmu.
- *
- * For example, the contents of <sysfs>/devices/cpu/format/event may be
- * "config:0-7" and will be represented here as name="event",
- * value=PERF_PMU_FORMAT_VALUE_CONFIG and bits 0 to 7 will be set.
- */
-struct perf_pmu_format {
-	/** @list: Element on list within struct perf_pmu. */
-	struct list_head list;
-	/** @bits: Which config bits are set by this format value. */
-	DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS);
-	/** @name: The modifier/file name. */
-	char *name;
-	/**
-	 * @value : Which config value the format relates to. Supported values
-	 * are from PERF_PMU_FORMAT_VALUE_CONFIG to
-	 * PERF_PMU_FORMAT_VALUE_CONFIG_END.
-	 */
-	u16 value;
-	/** @loaded: Has the contents been loaded/parsed. */
-	bool loaded;
-};
-
 static int pmu_aliases_parse(struct perf_pmu *pmu);
 
 static struct perf_pmu_format *perf_pmu__new_format(struct list_head *list, char *name)
@@ -1364,48 +1339,28 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu)
 	}
 }
 
-bool evsel__is_aux_event(const struct evsel *evsel)
-{
-	struct perf_pmu *pmu;
-
-	if (evsel->needs_auxtrace_mmap)
-		return true;
-
-	pmu = evsel__find_pmu(evsel);
-	return pmu && pmu->auxtrace;
-}
-
 /*
- * Set @config_name to @val as long as the user hasn't already set or cleared it
- * by passing a config term on the command line.
- *
- * @val is the value to put into the bits specified by @config_name rather than
- * the bit pattern. It is shifted into position by this function, so to set
- * something to true, pass 1 for val rather than a pre shifted value.
+ * Unpacks a raw config[n] value using the sparse bitfield that defines a
+ * format attr. For example "config1:1,6-7,44" defines a 4 bit value across non
+ * contiguous bits and this function returns those 4 bits as a value.
  */
-#define field_prep(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask))
-void evsel__set_config_if_unset(struct perf_pmu *pmu, struct evsel *evsel,
-				const char *config_name, u64 val)
+u64 perf_pmu__format_unpack(unsigned long *format, u64 config_val)
 {
-	u64 user_bits = 0, bits;
-	struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG);
-
-	if (term)
-		user_bits = term->val.cfg_chg;
+	int val_bit = 0;
+	u64 res = 0;
+	int fmt_bit;
 
-	bits = perf_pmu__format_bits(pmu, config_name);
+	for_each_set_bit(fmt_bit, format, PERF_PMU_FORMAT_BITS) {
+		if (config_val & (1ULL << fmt_bit))
+			res |= BIT_ULL(val_bit);
 
-	/* Do nothing if the user changed the value */
-	if (bits & user_bits)
-		return;
-
-	/* Otherwise replace it */
-	evsel->core.attr.config &= ~bits;
-	evsel->core.attr.config |= field_prep(bits, val);
+		val_bit++;
+	}
+	return res;
 }
 
-static struct perf_pmu_format *
-pmu_find_format(const struct list_head *formats, const char *name)
+struct perf_pmu_format *pmu_find_format(const struct list_head *formats,
+					const char *name)
 {
 	struct perf_pmu_format *format;
 
@@ -1416,7 +1371,7 @@ pmu_find_format(const struct list_head *formats, const char *name)
 	return NULL;
 }
 
-__u64 perf_pmu__format_bits(struct perf_pmu *pmu, const char *name)
+__u64 perf_pmu__format_bits(const struct perf_pmu *pmu, const char *name)
 {
 	struct perf_pmu_format *format = pmu_find_format(&pmu->format, name);
 	__u64 bits = 0;
@@ -1431,7 +1386,7 @@ __u64 perf_pmu__format_bits(struct perf_pmu *pmu, const char *name)
 	return bits;
 }
 
-int perf_pmu__format_type(struct perf_pmu *pmu, const char *name)
+int perf_pmu__format_type(const struct perf_pmu *pmu, const char *name)
 {
 	struct perf_pmu_format *format = pmu_find_format(&pmu->format, name);
 
@@ -1446,8 +1401,8 @@ int perf_pmu__format_type(struct perf_pmu *pmu, const char *name)
  * Sets value based on the format definition (format parameter)
  * and unformatted value (value parameter).
  */
-static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v,
-			     bool zero)
+void perf_pmu__format_pack(unsigned long *format, __u64 value, __u64 *v,
+			   bool zero)
 {
 	unsigned long fbit, vbit;
 
@@ -1564,23 +1519,23 @@ static int pmu_config_term(const struct perf_pmu *pmu,
 		switch (term->type_term) {
 		case PARSE_EVENTS__TERM_TYPE_CONFIG:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
-			pmu_format_value(bits, term->val.num, &attr->config, zero);
+			perf_pmu__format_pack(bits, term->val.num, &attr->config, zero);
 			break;
 		case PARSE_EVENTS__TERM_TYPE_CONFIG1:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
-			pmu_format_value(bits, term->val.num, &attr->config1, zero);
+			perf_pmu__format_pack(bits, term->val.num, &attr->config1, zero);
 			break;
 		case PARSE_EVENTS__TERM_TYPE_CONFIG2:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
-			pmu_format_value(bits, term->val.num, &attr->config2, zero);
+			perf_pmu__format_pack(bits, term->val.num, &attr->config2, zero);
 			break;
 		case PARSE_EVENTS__TERM_TYPE_CONFIG3:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
-			pmu_format_value(bits, term->val.num, &attr->config3, zero);
+			perf_pmu__format_pack(bits, term->val.num, &attr->config3, zero);
 			break;
 		case PARSE_EVENTS__TERM_TYPE_CONFIG4:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
-			pmu_format_value(bits, term->val.num, &attr->config4, zero);
+			perf_pmu__format_pack(bits, term->val.num, &attr->config4, zero);
 			break;
 		case PARSE_EVENTS__TERM_TYPE_LEGACY_HARDWARE_CONFIG:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
@@ -1718,7 +1673,7 @@ static int pmu_config_term(const struct perf_pmu *pmu,
 		 */
 	}
 
-	pmu_format_value(format->bits, val, vp, zero);
+	perf_pmu__format_pack(format->bits, val, vp, zero);
 	return 0;
 }
 
@@ -2422,6 +2377,18 @@ bool perf_pmu__is_software(const struct perf_pmu *pmu)
 	return false;
 }
 
+bool perf_pmu__benefits_from_affinity(struct perf_pmu *pmu)
+{
+	if (!pmu)
+		return true; /* Assume is core. */
+
+	/*
+	 * All perf event PMUs should benefit from accessing the perf event
+	 * contexts on the local CPU.
+	 */
+	return pmu->type <= PERF_PMU_TYPE_PE_END;
+}
+
 FILE *perf_pmu__open_file(const struct perf_pmu *pmu, const char *name)
 {
 	char path[PATH_MAX];
@@ -2765,3 +2732,14 @@ const char *perf_pmu__name_from_config(struct perf_pmu *pmu, u64 config)
 	}
 	return NULL;
 }
+
+bool perf_pmu__reads_only_on_cpu_idx0(const struct perf_event_attr *attr)
+{
+	enum tool_pmu_event event;
+
+	if (attr->type != PERF_PMU_TYPE_TOOL)
+		return false;
+
+	event = (enum tool_pmu_event)attr->config;
+	return event != TOOL_PMU__EVENT_USER_TIME && event != TOOL_PMU__EVENT_SYSTEM_TIME;
+}
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 8f11bfe8ed6d..0d9f3c57e8e8 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -233,6 +233,31 @@ struct pmu_event_info {
 	bool deprecated;
 };
 
+/**
+ * struct perf_pmu_format - Values from a format file read from
+ * <sysfs>/devices/cpu/format/ held in struct perf_pmu.
+ *
+ * For example, the contents of <sysfs>/devices/cpu/format/event may be
+ * "config:0-7" and will be represented here as name="event",
+ * value=PERF_PMU_FORMAT_VALUE_CONFIG and bits 0 to 7 will be set.
+ */
+struct perf_pmu_format {
+	/** @list: Element on list within struct perf_pmu. */
+	struct list_head list;
+	/** @bits: Which config bits are set by this format value. */
+	DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS);
+	/** @name: The modifier/file name. */
+	char *name;
+	/**
+	 * @value : Which config value the format relates to. Supported values
+	 * are from PERF_PMU_FORMAT_VALUE_CONFIG to
+	 * PERF_PMU_FORMAT_VALUE_CONFIG_END.
+	 */
+	u16 value;
+	/** @loaded: Has the contents been loaded/parsed. */
+	bool loaded;
+};
+
 typedef int (*pmu_event_callback)(void *state, struct pmu_event_info *info);
 typedef int (*pmu_format_callback)(void *state, const char *name, int config,
 				   const unsigned long *bits);
@@ -247,16 +272,21 @@ int perf_pmu__config_terms(const struct perf_pmu *pmu,
 			   struct parse_events_terms *terms,
 			   bool zero, bool apply_hardcoded,
 			   struct parse_events_error *error);
-__u64 perf_pmu__format_bits(struct perf_pmu *pmu, const char *name);
-int perf_pmu__format_type(struct perf_pmu *pmu, const char *name);
+__u64 perf_pmu__format_bits(const struct perf_pmu *pmu, const char *name);
+int perf_pmu__format_type(const struct perf_pmu *pmu, const char *name);
 int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_terms,
 			  struct perf_pmu_info *info, bool *rewrote_terms,
 			  u64 *alternate_hw_config, struct parse_events_error *err);
 int perf_pmu__find_event(struct perf_pmu *pmu, const char *event, void *state, pmu_event_callback cb);
 
+void perf_pmu__format_pack(unsigned long *format, __u64 value, __u64 *v,
+			   bool zero);
+struct perf_pmu_format *pmu_find_format(const struct list_head *formats,
+					const char *name);
 void perf_pmu_format__set_value(void *format, int config, unsigned long *bits);
 bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name);
 int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_callback cb);
+u64 perf_pmu__format_unpack(unsigned long *format, u64 config_val);
 
 bool is_pmu_core(const char *name);
 bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu);
@@ -273,6 +303,7 @@ bool perf_pmu__name_no_suffix_match(const struct perf_pmu *pmu, const char *to_m
  *                        perf_sw_context in the kernel?
  */
 bool perf_pmu__is_software(const struct perf_pmu *pmu);
+bool perf_pmu__benefits_from_affinity(struct perf_pmu *pmu);
 
 FILE *perf_pmu__open_file(const struct perf_pmu *pmu, const char *name);
 FILE *perf_pmu__open_file_at(const struct perf_pmu *pmu, int dirfd, const char *name);
@@ -320,6 +351,8 @@ void perf_pmu__delete(struct perf_pmu *pmu);
 const char *perf_pmu__name_from_config(struct perf_pmu *pmu, u64 config);
 bool perf_pmu__is_fake(const struct perf_pmu *pmu);
 
+bool perf_pmu__reads_only_on_cpu_idx0(const struct perf_event_attr *attr);
+
 static inline enum pmu_kind perf_pmu__kind(const struct perf_pmu *pmu)
 {
 	__u32 type;
diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c
index 8f3ed83853a9..cb27e2898aa0 100644
--- a/tools/perf/util/print-events.c
+++ b/tools/perf/util/print-events.c
@@ -86,7 +86,7 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state)
 
 	strlist__for_each_entry(sdt_name, sdtlist) {
 		bool show_detail = false;
-		char *bid = strchr(sdt_name->s, '@');
+		char *bid = (char *)strchr(sdt_name->s, '@');
 		char *evt_name = NULL;
 
 		if (bid)
@@ -97,14 +97,9 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state)
 		} else {
 			next_sdt_name = strlist__next(sdt_name);
 			if (next_sdt_name) {
-				char *bid2 = strchr(next_sdt_name->s, '@');
-
-				if (bid2)
-					*bid2 = '\0';
-				if (strcmp(sdt_name->s, next_sdt_name->s) == 0)
-					show_detail = true;
-				if (bid2)
-					*bid2 = '@';
+				const char *bid2 = strchrnul(next_sdt_name->s, '@');
+
+				show_detail = strncmp(sdt_name->s, next_sdt_name->s, bid2 - next_sdt_name->s) == 0;
 			}
 		}
 		last_sdt_name = sdt_name->s;
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c
index 5069fb61f48c..f78c3bc3d601 100644
--- a/tools/perf/util/probe-file.c
+++ b/tools/perf/util/probe-file.c
@@ -28,6 +28,7 @@
 #include "session.h"
 #include "perf_regs.h"
 #include "string2.h"
+#include "dwarf-regs.h"
 
 /* 4096 - 2 ('\n' + '\0') */
 #define MAX_CMDLEN 4094
@@ -784,7 +785,7 @@ static int synthesize_sdt_probe_arg(struct strbuf *buf, int i, const char *arg)
 		op = desc;
 	}
 
-	ret = arch_sdt_arg_parse_op(op, &new_op);
+	ret = perf_sdt_arg_parse_op(EM_HOST, op, &new_op);
 
 	if (ret < 0)
 		goto error;
diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h
index ea3a6c4657ee..93627c9a7338 100644
--- a/tools/perf/util/record.h
+++ b/tools/perf/util/record.h
@@ -40,6 +40,8 @@ struct record_opts {
 	bool	      record_cgroup;
 	bool	      record_switch_events;
 	bool	      record_switch_events_set;
+	bool	      record_data_mmap;
+	bool	      record_data_mmap_set;
 	bool	      all_kernel;
 	bool	      all_user;
 	bool	      kernel_callchains;
diff --git a/tools/perf/util/sample.c b/tools/perf/util/sample.c
index 605fee971f55..8f82aaf1aab6 100644
--- a/tools/perf/util/sample.c
+++ b/tools/perf/util/sample.c
@@ -1,9 +1,18 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include "sample.h"
 #include "debug.h"
+#include "thread.h"
+#include <elf.h>
+#ifndef EM_CSKY
+#define EM_CSKY		252
+#endif
+#ifndef EM_LOONGARCH
+#define EM_LOONGARCH	258
+#endif
 #include <linux/zalloc.h>
 #include <stdlib.h>
 #include <string.h>
+#include "../../arch/x86/include/asm/insn.h"
 
 void perf_sample__init(struct perf_sample *sample, bool all)
 {
@@ -41,3 +50,71 @@ struct regs_dump *perf_sample__intr_regs(struct perf_sample *sample)
 	}
 	return sample->intr_regs;
 }
+
+static int elf_machine_max_instruction_length(uint16_t e_machine)
+{
+	switch (e_machine) {
+	/* Fixed 4-byte (32-bit) architectures */
+	case EM_AARCH64:
+	case EM_PPC:
+	case EM_PPC64:
+	case EM_MIPS:
+	case EM_SPARC:
+	case EM_SPARCV9:
+	case EM_ALPHA:
+	case EM_LOONGARCH:
+	case EM_PARISC:
+	case EM_SH:
+		return 4;
+
+	/* Variable length or mixed-mode architectures */
+	case EM_ARM:    /* Variable due to Thumb/Thumb-2 */
+	case EM_RISCV:  /* Variable due to Compressed (C) extension */
+	case EM_CSKY:   /* Variable (16 or 32 bit) */
+	case EM_ARC:    /* Variable (ARCompact) */
+		return 4;
+	case EM_S390:   /* Variable (2, 4, or 6 bytes) */
+		return 6;
+	case EM_68K:
+		return 10;
+	case EM_386:
+	case EM_X86_64:
+		return 15;
+	case EM_XTENSA: /* Variable (FLIX) */
+		return 16;
+	default:
+		return MAX_INSN;
+	}
+}
+
+void perf_sample__fetch_insn(struct perf_sample *sample,
+			     struct thread *thread,
+			     struct machine *machine)
+{
+	int ret, len;
+	bool is64bit = false;
+	uint16_t e_machine;
+
+	if (!sample->ip || sample->insn_len != 0)
+		return;
+
+	e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL);
+	len = elf_machine_max_instruction_length(e_machine);
+	len = thread__memcpy(thread, machine, sample->insn,
+			     sample->ip, len,
+			     &is64bit);
+	if (len <= 0)
+		return;
+
+	sample->insn_len = len;
+
+	if (e_machine == EM_386 || e_machine == EM_X86_64) {
+		/* Refine the x86 instruction length with the decoder. */
+		struct insn insn;
+
+		ret = insn_decode(&insn, sample->insn, len,
+				  is64bit ? INSN_MODE_64 : INSN_MODE_32);
+		if (ret >= 0 && insn.length <= len)
+			sample->insn_len = insn.length;
+	}
+}
diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index a8307b20a9ea..3cce8dd202aa 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -5,6 +5,9 @@
 #include <linux/perf_event.h>
 #include <linux/types.h>
 
+struct machine;
+struct thread;
+
 /* number of register is bound by the number of bits in regs_dump::mask (64) */
 #define PERF_SAMPLE_REGS_CACHE_SIZE (8 * sizeof(u64))
 
@@ -127,6 +130,10 @@ void perf_sample__exit(struct perf_sample *sample);
 struct regs_dump *perf_sample__user_regs(struct perf_sample *sample);
 struct regs_dump *perf_sample__intr_regs(struct perf_sample *sample);
 
+void perf_sample__fetch_insn(struct perf_sample *sample,
+			     struct thread *thread,
+			     struct machine *machine);
+
 /*
  * raw_data is always 4 bytes from an 8-byte boundary, so subtract 4 to get
  * 8-byte alignment.
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 6655c0bbe0d8..2b0df7bd9a46 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -50,6 +50,7 @@
 #include "../thread-stack.h"
 #include "../trace-event.h"
 #include "../call-path.h"
+#include "dwarf-regs.h"
 #include "map.h"
 #include "symbol.h"
 #include "thread_map.h"
@@ -713,7 +714,8 @@ static void set_sample_datasrc_in_dict(PyObject *dict,
 			_PyUnicode_FromString(decode));
 }
 
-static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, char *bf, int size)
+static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, uint32_t e_flags,
+		     char *bf, int size)
 {
 	unsigned int i = 0, r;
 	int printed = 0;
@@ -731,7 +733,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch
 
 		printed += scnprintf(bf + printed, size - printed,
 				     "%5s:0x%" PRIx64 " ",
-				     perf_reg_name(r, arch), val);
+				     perf_reg_name(r, e_machine, e_flags), val);
 	}
 }
 
@@ -739,10 +741,11 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch
 
 static int set_regs_in_dict(PyObject *dict,
 			     struct perf_sample *sample,
-			     struct evsel *evsel)
+			     struct evsel *evsel,
+			     uint16_t e_machine,
+			     uint32_t e_flags)
 {
 	struct perf_event_attr *attr = &evsel->core.attr;
-	const char *arch = perf_env__arch(evsel__env(evsel));
 
 	int size = (__sw_hweight64(attr->sample_regs_intr) * MAX_REG_SIZE) + 1;
 	char *bf = NULL;
@@ -752,7 +755,7 @@ static int set_regs_in_dict(PyObject *dict,
 		if (!bf)
 			return -1;
 
-		regs_map(sample->intr_regs, attr->sample_regs_intr, arch, bf, size);
+		regs_map(sample->intr_regs, attr->sample_regs_intr, e_machine, e_flags, bf, size);
 
 		pydict_set_item_string_decref(dict, "iregs",
 					_PyUnicode_FromString(bf));
@@ -764,7 +767,7 @@ static int set_regs_in_dict(PyObject *dict,
 			if (!bf)
 				return -1;
 		}
-		regs_map(sample->user_regs, attr->sample_regs_user, arch, bf, size);
+		regs_map(sample->user_regs, attr->sample_regs_user, e_machine, e_flags, bf, size);
 
 		pydict_set_item_string_decref(dict, "uregs",
 					_PyUnicode_FromString(bf));
@@ -834,6 +837,8 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 					 PyObject *callchain)
 {
 	PyObject *dict, *dict_sample, *brstack, *brstacksym;
+	uint16_t e_machine = EM_HOST;
+	uint32_t e_flags = EF_HOST;
 
 	dict = PyDict_New();
 	if (!dict)
@@ -920,7 +925,10 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 			PyLong_FromUnsignedLongLong(sample->cyc_cnt));
 	}
 
-	if (set_regs_in_dict(dict, sample, evsel))
+	if (al->thread)
+		e_machine = thread__e_machine(al->thread, /*machine=*/NULL, &e_flags);
+
+	if (set_regs_in_dict(dict, sample, evsel, e_machine, e_flags))
 		Py_FatalError("Failed to setting regs in dict");
 
 	return dict;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 4236503c8f6c..4b465abfa36c 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -17,6 +17,7 @@
 #include "map_symbol.h"
 #include "branch.h"
 #include "debug.h"
+#include "dwarf-regs.h"
 #include "env.h"
 #include "evlist.h"
 #include "evsel.h"
@@ -697,6 +698,20 @@ static void perf_event__time_conv_swap(union perf_event *event,
 	}
 }
 
+static void
+perf_event__schedstat_cpu_swap(union perf_event *event __maybe_unused,
+			       bool sample_id_all __maybe_unused)
+{
+	/* FIXME */
+}
+
+static void
+perf_event__schedstat_domain_swap(union perf_event *event __maybe_unused,
+				  bool sample_id_all __maybe_unused)
+{
+	/* FIXME */
+}
+
 typedef void (*perf_event__swap_op)(union perf_event *event,
 				    bool sample_id_all);
 
@@ -736,6 +751,8 @@ static perf_event__swap_op perf_event__swap_ops[] = {
 	[PERF_RECORD_STAT_ROUND]	  = perf_event__stat_round_swap,
 	[PERF_RECORD_EVENT_UPDATE]	  = perf_event__event_update_swap,
 	[PERF_RECORD_TIME_CONV]		  = perf_event__time_conv_swap,
+	[PERF_RECORD_SCHEDSTAT_CPU]	  = perf_event__schedstat_cpu_swap,
+	[PERF_RECORD_SCHEDSTAT_DOMAIN]	  = perf_event__schedstat_domain_swap,
 	[PERF_RECORD_HEADER_MAX]	  = NULL,
 };
 
@@ -841,6 +858,28 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample)
 	}
 }
 
+static const char *callchain_context_str(u64 ip)
+{
+	switch (ip) {
+	case PERF_CONTEXT_HV:
+		return " (PERF_CONTEXT_HV)";
+	case PERF_CONTEXT_KERNEL:
+		return " (PERF_CONTEXT_KERNEL)";
+	case PERF_CONTEXT_USER:
+		return " (PERF_CONTEXT_USER)";
+	case PERF_CONTEXT_GUEST:
+		return " (PERF_CONTEXT_GUEST)";
+	case PERF_CONTEXT_GUEST_KERNEL:
+		return " (PERF_CONTEXT_GUEST_KERNEL)";
+	case PERF_CONTEXT_GUEST_USER:
+		return " (PERF_CONTEXT_GUEST_USER)";
+	case PERF_CONTEXT_USER_DEFERRED:
+		return " (PERF_CONTEXT_USER_DEFERRED)";
+	default:
+		return "";
+	}
+}
+
 static void callchain__printf(struct evsel *evsel,
 			      struct perf_sample *sample)
 {
@@ -853,8 +892,9 @@ static void callchain__printf(struct evsel *evsel,
 	printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr);
 
 	for (i = 0; i < callchain->nr; i++)
-		printf("..... %2d: %016" PRIx64 "\n",
-		       i, callchain->ips[i]);
+		printf("..... %2d: %016" PRIx64 "%s\n",
+		       i, callchain->ips[i],
+		       callchain_context_str(callchain->ips[i]));
 
 	if (sample->deferred_callchain)
 		printf("...... (deferred)\n");
@@ -919,7 +959,7 @@ static void branch_stack__printf(struct perf_sample *sample,
 	}
 }
 
-static void regs_dump__printf(u64 mask, u64 *regs, const char *arch)
+static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine, uint32_t e_flags)
 {
 	unsigned rid, i = 0;
 
@@ -927,7 +967,7 @@ static void regs_dump__printf(u64 mask, u64 *regs, const char *arch)
 		u64 val = regs[i++];
 
 		printf(".... %-5s 0x%016" PRIx64 "\n",
-		       perf_reg_name(rid, arch), val);
+		       perf_reg_name(rid, e_machine, e_flags), val);
 	}
 }
 
@@ -945,7 +985,8 @@ static inline const char *regs_dump_abi(struct regs_dump *d)
 	return regs_abi[d->abi];
 }
 
-static void regs__printf(const char *type, struct regs_dump *regs, const char *arch)
+static void regs__printf(const char *type, struct regs_dump *regs,
+			 uint16_t e_machine, uint32_t e_flags)
 {
 	u64 mask = regs->mask;
 
@@ -954,10 +995,10 @@ static void regs__printf(const char *type, struct regs_dump *regs, const char *a
 	       mask,
 	       regs_dump_abi(regs));
 
-	regs_dump__printf(mask, regs->regs, arch);
+	regs_dump__printf(mask, regs->regs, e_machine, e_flags);
 }
 
-static void regs_user__printf(struct perf_sample *sample, const char *arch)
+static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags)
 {
 	struct regs_dump *user_regs;
 
@@ -967,10 +1008,10 @@ static void regs_user__printf(struct perf_sample *sample, const char *arch)
 	user_regs = perf_sample__user_regs(sample);
 
 	if (user_regs->regs)
-		regs__printf("user", user_regs, arch);
+		regs__printf("user", user_regs, e_machine, e_flags);
 }
 
-static void regs_intr__printf(struct perf_sample *sample, const char *arch)
+static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags)
 {
 	struct regs_dump *intr_regs;
 
@@ -980,7 +1021,7 @@ static void regs_intr__printf(struct perf_sample *sample, const char *arch)
 	intr_regs = perf_sample__intr_regs(sample);
 
 	if (intr_regs->regs)
-		regs__printf("intr", intr_regs, arch);
+		regs__printf("intr", intr_regs, e_machine, e_flags);
 }
 
 static void stack_user__printf(struct stack_dump *dump)
@@ -1069,21 +1110,29 @@ char *get_page_size_name(u64 size, char *str)
 	return str;
 }
 
-static void dump_sample(struct evsel *evsel, union perf_event *event,
-			struct perf_sample *sample, const char *arch)
+static void dump_sample(struct machine *machine, struct evsel *evsel, union perf_event *event,
+			struct perf_sample *sample)
 {
 	u64 sample_type;
 	char str[PAGE_SIZE_NAME_LEN];
+	uint16_t e_machine = EM_NONE;
+	uint32_t e_flags = 0;
 
 	if (!dump_trace)
 		return;
 
+	sample_type = evsel->core.attr.sample_type;
+
+	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR)) {
+		struct thread *thread = machine__find_thread(machine, sample->pid, sample->pid);
+
+		e_machine = thread__e_machine(thread, machine, &e_flags);
+	}
+
 	printf("(IP, 0x%x): %d/%d: %#" PRIx64 " period: %" PRIu64 " addr: %#" PRIx64 "\n",
 	       event->header.misc, sample->pid, sample->tid, sample->ip,
 	       sample->period, sample->addr);
 
-	sample_type = evsel->core.attr.sample_type;
-
 	if (evsel__has_callchain(evsel))
 		callchain__printf(evsel, sample);
 
@@ -1091,10 +1140,10 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
 		branch_stack__printf(sample, evsel);
 
 	if (sample_type & PERF_SAMPLE_REGS_USER)
-		regs_user__printf(sample, arch);
+		regs_user__printf(sample, e_machine, e_flags);
 
 	if (sample_type & PERF_SAMPLE_REGS_INTR)
-		regs_intr__printf(sample, arch);
+		regs_intr__printf(sample, e_machine, e_flags);
 
 	if (sample_type & PERF_SAMPLE_STACK_USER)
 		stack_user__printf(&sample->user_stack);
@@ -1409,10 +1458,10 @@ static int machines__deliver_event(struct machines *machines,
 		}
 		if (machine == NULL) {
 			++evlist->stats.nr_unprocessable_samples;
-			dump_sample(evsel, event, sample, perf_env__arch(NULL));
+			dump_sample(machine, evsel, event, sample);
 			return 0;
 		}
-		dump_sample(evsel, event, sample, perf_env__arch(machine->env));
+		dump_sample(machine, evsel, event, sample);
 		if (sample->deferred_callchain && tool->merge_deferred_callchains) {
 			struct deferred_event *de = malloc(sizeof(*de));
 			size_t sz = event->header.size;
@@ -1636,6 +1685,12 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 	case PERF_RECORD_BPF_METADATA:
 		err = tool->bpf_metadata(tool, session, event);
 		break;
+	case PERF_RECORD_SCHEDSTAT_CPU:
+		err = tool->schedstat_cpu(tool, session, event);
+		break;
+	case PERF_RECORD_SCHEDSTAT_DOMAIN:
+		err = tool->schedstat_domain(tool, session, event);
+		break;
 	default:
 		err = -EINVAL;
 		break;
@@ -2326,9 +2381,10 @@ reader__read_event(struct reader *rd, struct perf_session *session,
 
 	if (size < sizeof(struct perf_event_header) ||
 	    (skip = rd->process(session, event, rd->file_pos, rd->path)) < 0) {
-		pr_err("%#" PRIx64 " [%#x]: failed to process type: %d [%s]\n",
+		errno = -skip;
+		pr_err("%#" PRIx64 " [%#x]: failed to process type: %d [%m]\n",
 		       rd->file_offset + rd->head, event->header.size,
-		       event->header.type, strerror(-skip));
+		       event->header.type);
 		err = skip;
 		goto out;
 	}
@@ -2620,7 +2676,7 @@ bool perf_session__has_switch_events(struct perf_session *session)
 
 int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name, u64 addr)
 {
-	char *bracket;
+	char *bracket, *name;
 	struct ref_reloc_sym *ref;
 	struct kmap *kmap;
 
@@ -2628,13 +2684,13 @@ int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name, u6
 	if (ref == NULL)
 		return -ENOMEM;
 
-	ref->name = strdup(symbol_name);
+	ref->name = name = strdup(symbol_name);
 	if (ref->name == NULL) {
 		free(ref);
 		return -ENOMEM;
 	}
 
-	bracket = strchr(ref->name, ']');
+	bracket = strchr(name, ']');
 	if (bracket)
 		*bracket = '\0';
 
@@ -2674,11 +2730,14 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
 
 size_t perf_session__fprintf(struct perf_session *session, FILE *fp)
 {
-	/*
-	 * FIXME: Here we have to actually print all the machines in this
-	 * session, not just the host...
-	 */
-	return machine__fprintf(&session->machines.host, fp);
+	size_t ret = machine__fprintf(&session->machines.host, fp);
+
+	for (struct rb_node *nd = rb_first_cached(&session->machines.guests); nd; nd = rb_next(nd)) {
+		struct machine *pos = rb_entry(nd, struct machine, rb_node);
+
+		ret += machine__fprintf(pos, fp);
+	}
+	return ret;
 }
 
 void perf_session__dump_kmaps(struct perf_session *session)
@@ -2904,3 +2963,68 @@ struct perf_env *perf_session__env(struct perf_session *session)
 {
 	return &session->header.env;
 }
+
+struct perf_session__e_machine_cb_args {
+	uint32_t e_flags;
+	uint16_t e_machine;
+};
+
+static int perf_session__e_machine_cb(struct thread *thread, void *_args)
+{
+	struct perf_session__e_machine_cb_args *args = _args;
+
+	args->e_machine = thread__e_machine(thread, /*machine=*/NULL, &args->e_flags);
+	return args->e_machine != EM_NONE ? 1 : 0;
+}
+
+/*
+ * Note, a machine may have mixed 32-bit and 64-bit processes and so mixed
+ * e_machines. Use thread__e_machine when this matters.
+ */
+uint16_t perf_session__e_machine(struct perf_session *session, uint32_t *e_flags)
+{
+	struct perf_session__e_machine_cb_args args = {
+		.e_machine = EM_NONE,
+	};
+	struct perf_env *env;
+
+	if (!session) {
+		/* Default to assuming a host machine. */
+		if (e_flags)
+			*e_flags = EF_HOST;
+
+		return EM_HOST;
+	}
+
+	env = perf_session__env(session);
+	if (env && env->e_machine != EM_NONE) {
+		if (e_flags)
+			*e_flags = env->e_flags;
+
+		return env->e_machine;
+	}
+
+	machines__for_each_thread(&session->machines,
+				  perf_session__e_machine_cb,
+				  &args);
+
+	if (args.e_machine != EM_NONE) {
+		if (env) {
+			env->e_machine = args.e_machine;
+			env->e_flags = args.e_flags;
+		}
+		if (e_flags)
+			*e_flags = args.e_flags;
+
+		return args.e_machine;
+	}
+
+	/*
+	 * Couldn't determine from the perf_env or current set of
+	 * threads. Default to the host.
+	 */
+	if (e_flags)
+		*e_flags = EF_HOST;
+
+	return EM_HOST;
+}
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 22d3ff877e83..f05f0d4a6c23 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -211,5 +211,6 @@ int perf_event__process_finished_round(const struct perf_tool *tool,
 				       struct ordered_events *oe);
 
 struct perf_env *perf_session__env(struct perf_session *session);
+uint16_t perf_session__e_machine(struct perf_session *session, uint32_t *e_flags);
 
 #endif /* __PERF_SESSION_H */
diff --git a/tools/perf/util/sha1.c b/tools/perf/util/sha1.c
deleted file mode 100644
index 7032fa4ff3fd..000000000000
--- a/tools/perf/util/sha1.c
+++ /dev/null
@@ -1,97 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SHA-1 message digest algorithm
- *
- * Copyright 2025 Google LLC
- */
-#include <linux/bitops.h>
-#include <linux/kernel.h>
-#include <linux/unaligned.h>
-#include <string.h>
-
-#include "sha1.h"
-
-#define SHA1_BLOCK_SIZE 64
-
-static const u32 sha1_K[4] = { 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 };
-
-#define SHA1_ROUND(i, a, b, c, d, e)                                          \
-	do {                                                                  \
-		if ((i) >= 16)                                                \
-			w[i] = rol32(w[(i) - 16] ^ w[(i) - 14] ^ w[(i) - 8] ^ \
-					     w[(i) - 3],                      \
-				     1);                                      \
-		e += w[i] + rol32(a, 5) + sha1_K[(i) / 20];                   \
-		if ((i) < 20)                                                 \
-			e += (b & (c ^ d)) ^ d;                               \
-		else if ((i) < 40 || (i) >= 60)                               \
-			e += b ^ c ^ d;                                       \
-		else                                                          \
-			e += (c & d) ^ (b & (c ^ d));                         \
-		b = rol32(b, 30);                                             \
-		/* The new (a, b, c, d, e) is the old (e, a, b, c, d). */     \
-	} while (0)
-
-#define SHA1_5ROUNDS(i)                             \
-	do {                                        \
-		SHA1_ROUND((i) + 0, a, b, c, d, e); \
-		SHA1_ROUND((i) + 1, e, a, b, c, d); \
-		SHA1_ROUND((i) + 2, d, e, a, b, c); \
-		SHA1_ROUND((i) + 3, c, d, e, a, b); \
-		SHA1_ROUND((i) + 4, b, c, d, e, a); \
-	} while (0)
-
-#define SHA1_20ROUNDS(i)                \
-	do {                            \
-		SHA1_5ROUNDS((i) + 0);  \
-		SHA1_5ROUNDS((i) + 5);  \
-		SHA1_5ROUNDS((i) + 10); \
-		SHA1_5ROUNDS((i) + 15); \
-	} while (0)
-
-static void sha1_blocks(u32 h[5], const u8 *data, size_t nblocks)
-{
-	while (nblocks--) {
-		u32 a = h[0];
-		u32 b = h[1];
-		u32 c = h[2];
-		u32 d = h[3];
-		u32 e = h[4];
-		u32 w[80];
-
-		for (int i = 0; i < 16; i++)
-			w[i] = get_unaligned_be32(&data[i * 4]);
-		SHA1_20ROUNDS(0);
-		SHA1_20ROUNDS(20);
-		SHA1_20ROUNDS(40);
-		SHA1_20ROUNDS(60);
-
-		h[0] += a;
-		h[1] += b;
-		h[2] += c;
-		h[3] += d;
-		h[4] += e;
-		data += SHA1_BLOCK_SIZE;
-	}
-}
-
-/* Calculate the SHA-1 message digest of the given data. */
-void sha1(const void *data, size_t len, u8 out[SHA1_DIGEST_SIZE])
-{
-	u32 h[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476,
-		     0xC3D2E1F0 };
-	u8 final_data[2 * SHA1_BLOCK_SIZE] = { 0 };
-	size_t final_len = len % SHA1_BLOCK_SIZE;
-
-	sha1_blocks(h, data, len / SHA1_BLOCK_SIZE);
-
-	memcpy(final_data, data + len - final_len, final_len);
-	final_data[final_len] = 0x80;
-	final_len = round_up(final_len + 9, SHA1_BLOCK_SIZE);
-	put_unaligned_be64((u64)len * 8, &final_data[final_len - 8]);
-
-	sha1_blocks(h, final_data, final_len / SHA1_BLOCK_SIZE);
-
-	for (int i = 0; i < 5; i++)
-		put_unaligned_be32(h[i], &out[i * 4]);
-}
diff --git a/tools/perf/util/sha1.h b/tools/perf/util/sha1.h
deleted file mode 100644
index e92c9966e1d5..000000000000
--- a/tools/perf/util/sha1.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#include <linux/types.h>
-
-#define SHA1_DIGEST_SIZE 20
-
-void sha1(const void *data, size_t len, u8 out[SHA1_DIGEST_SIZE]);
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index f3a565b0e230..42d5cd7ef4e2 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1016,7 +1016,7 @@ static int hist_entry__cgroup_snprintf(struct hist_entry *he,
 	const char *cgrp_name = "N/A";
 
 	if (he->cgroup) {
-		struct cgroup *cgrp = cgroup__find(maps__machine(he->ms.maps)->env,
+		struct cgroup *cgrp = cgroup__find(maps__machine(thread__maps(he->ms.thread))->env,
 						   he->cgroup);
 		if (cgrp != NULL)
 			cgrp_name = cgrp->name;
@@ -2474,8 +2474,7 @@ struct sort_entry sort_type_offset = {
 
 /* --sort typecln */
 
-/* TODO: use actual value in the system */
-#define TYPE_CACHELINE_SIZE  64
+#define DEFAULT_CACHELINE_SIZE 64
 
 static int64_t
 sort__typecln_sort(struct hist_entry *left, struct hist_entry *right)
@@ -2484,6 +2483,10 @@ sort__typecln_sort(struct hist_entry *left, struct hist_entry *right)
 	struct annotated_data_type *right_type = right->mem_type;
 	int64_t left_cln, right_cln;
 	int64_t ret;
+	int cln_size = cacheline_size();
+
+	if (cln_size == 0)
+		cln_size = DEFAULT_CACHELINE_SIZE;
 
 	if (!left_type) {
 		sort__type_init(left);
@@ -2499,8 +2502,8 @@ sort__typecln_sort(struct hist_entry *left, struct hist_entry *right)
 	if (ret)
 		return ret;
 
-	left_cln = left->mem_type_off / TYPE_CACHELINE_SIZE;
-	right_cln = right->mem_type_off / TYPE_CACHELINE_SIZE;
+	left_cln = left->mem_type_off / cln_size;
+	right_cln = right->mem_type_off / cln_size;
 	return left_cln - right_cln;
 }
 
@@ -2508,9 +2511,13 @@ static int hist_entry__typecln_snprintf(struct hist_entry *he, char *bf,
 				     size_t size, unsigned int width __maybe_unused)
 {
 	struct annotated_data_type *he_type = he->mem_type;
+	int cln_size = cacheline_size();
+
+	if (cln_size == 0)
+		cln_size = DEFAULT_CACHELINE_SIZE;
 
 	return repsep_snprintf(bf, size, "%s: cache-line %d", he_type->self.type_name,
-			       he->mem_type_off / TYPE_CACHELINE_SIZE);
+			       he->mem_type_off / cln_size);
 }
 
 struct sort_entry sort_type_cacheline = {
@@ -3538,6 +3545,56 @@ out:
 	return ret;
 }
 
+static int __sort_dimension__update(struct sort_dimension *sd,
+				    struct perf_hpp_list *list)
+{
+	if (sd->entry == &sort_parent && parent_pattern) {
+		int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
+		if (ret) {
+			char err[BUFSIZ];
+
+			regerror(ret, &parent_regex, err, sizeof(err));
+			pr_err("Invalid regex: %s\n%s", parent_pattern, err);
+			return -EINVAL;
+		}
+		list->parent = 1;
+	} else if (sd->entry == &sort_sym) {
+		list->sym = 1;
+		/*
+		 * perf diff displays the performance difference amongst
+		 * two or more perf.data files. Those files could come
+		 * from different binaries. So we should not compare
+		 * their ips, but the name of symbol.
+		 */
+		if (sort__mode == SORT_MODE__DIFF)
+			sd->entry->se_collapse = sort__sym_sort;
+
+	} else if (sd->entry == &sort_sym_offset) {
+		list->sym = 1;
+	} else if (sd->entry == &sort_dso) {
+		list->dso = 1;
+	} else if (sd->entry == &sort_socket) {
+		list->socket = 1;
+	} else if (sd->entry == &sort_thread) {
+		list->thread = 1;
+	} else if (sd->entry == &sort_comm) {
+		list->comm = 1;
+	} else if (sd->entry == &sort_type_offset) {
+		symbol_conf.annotate_data_member = true;
+	} else if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) {
+		list->sym = 1;
+	} else if (sd->entry == &sort_mem_dcacheline && cacheline_size() == 0) {
+		return -EINVAL;
+	} else if (sd->entry == &sort_mem_daddr_sym) {
+		list->sym = 1;
+	}
+
+	if (sd->entry->se_collapse)
+		list->need_collapse = 1;
+
+	return 0;
+}
+
 static int __sort_dimension__add(struct sort_dimension *sd,
 				 struct perf_hpp_list *list,
 				 int level)
@@ -3548,8 +3605,8 @@ static int __sort_dimension__add(struct sort_dimension *sd,
 	if (__sort_dimension__add_hpp_sort(sd, list, level) < 0)
 		return -1;
 
-	if (sd->entry->se_collapse)
-		list->need_collapse = 1;
+	if (__sort_dimension__update(sd, list) < 0)
+		return -1;
 
 	sd->taken = 1;
 
@@ -3585,6 +3642,9 @@ static int __sort_dimension__add_output(struct perf_hpp_list *list,
 	if (__sort_dimension__add_hpp_output(sd, list, level) < 0)
 		return -1;
 
+	if (__sort_dimension__update(sd, list) < 0)
+		return -1;
+
 	sd->taken = 1;
 	return 0;
 }
@@ -3648,39 +3708,6 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 				sort_dimension_add_dynamic_header(sd, env);
 		}
 
-		if (sd->entry == &sort_parent && parent_pattern) {
-			int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
-			if (ret) {
-				char err[BUFSIZ];
-
-				regerror(ret, &parent_regex, err, sizeof(err));
-				pr_err("Invalid regex: %s\n%s", parent_pattern, err);
-				return -EINVAL;
-			}
-			list->parent = 1;
-		} else if (sd->entry == &sort_sym) {
-			list->sym = 1;
-			/*
-			 * perf diff displays the performance difference amongst
-			 * two or more perf.data files. Those files could come
-			 * from different binaries. So we should not compare
-			 * their ips, but the name of symbol.
-			 */
-			if (sort__mode == SORT_MODE__DIFF)
-				sd->entry->se_collapse = sort__sym_sort;
-
-		} else if (sd->entry == &sort_dso) {
-			list->dso = 1;
-		} else if (sd->entry == &sort_socket) {
-			list->socket = 1;
-		} else if (sd->entry == &sort_thread) {
-			list->thread = 1;
-		} else if (sd->entry == &sort_comm) {
-			list->comm = 1;
-		} else if (sd->entry == &sort_type_offset) {
-			symbol_conf.annotate_data_member = true;
-		}
-
 		return __sort_dimension__add(sd, list, level);
 	}
 
@@ -3699,9 +3726,6 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 				    strlen(tok)))
 			return -EINVAL;
 
-		if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to)
-			list->sym = 1;
-
 		__sort_dimension__add(sd, list, level);
 		return 0;
 	}
@@ -3715,12 +3739,6 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 		if (sort__mode != SORT_MODE__MEMORY)
 			return -EINVAL;
 
-		if (sd->entry == &sort_mem_dcacheline && cacheline_size() == 0)
-			return -EINVAL;
-
-		if (sd->entry == &sort_mem_daddr_sym)
-			list->sym = 1;
-
 		__sort_dimension__add(sd, list, level);
 		return 0;
 	}
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index 27c0966611ab..9be42f398440 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -6,9 +6,12 @@
 #include "libbfd.h"
 #include "llvm.h"
 #include "symbol.h"
+#include "libdw.h"
+#include "debug.h"
 
 #include <inttypes.h>
 #include <string.h>
+#include <linux/string.h>
 
 bool srcline_full_filename;
 
@@ -51,6 +54,25 @@ int inline_list__append(struct symbol *symbol, char *srcline, struct inline_node
 	return 0;
 }
 
+int inline_list__append_tail(struct symbol *symbol, char *srcline, struct inline_node *node)
+{
+	struct inline_list *ilist;
+
+	ilist = zalloc(sizeof(*ilist));
+	if (ilist == NULL)
+		return -1;
+
+	ilist->symbol = symbol;
+	ilist->srcline = srcline;
+
+	if (callchain_param.order == ORDER_CALLEE)
+		list_add(&ilist->list, &node->val);
+	else
+		list_add_tail(&ilist->list, &node->val);
+
+	return 0;
+}
+
 /* basename version that takes a const input string */
 static const char *gnu_basename(const char *path)
 {
@@ -118,17 +140,95 @@ static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int *
 		     struct dso *dso, bool unwind_inlines, struct inline_node *node,
 		     struct symbol *sym)
 {
-	int ret;
+	int ret = 0;
+
+	if (symbol_conf.addr2line_style[0] == A2L_STYLE_UNKNOWN) {
+		int i = 0;
+
+		/* Default addr2line fallback order. */
+#ifdef HAVE_LIBDW_SUPPORT
+		symbol_conf.addr2line_style[i++] = A2L_STYLE_LIBDW;
+#endif
+#ifdef HAVE_LIBLLVM_SUPPORT
+		symbol_conf.addr2line_style[i++] = A2L_STYLE_LLVM;
+#endif
+#ifdef HAVE_LIBBFD_SUPPORT
+		symbol_conf.addr2line_style[i++] = A2L_STYLE_LIBBFD;
+#endif
+		symbol_conf.addr2line_style[i++] = A2L_STYLE_CMD;
+	}
+
+	for (size_t i = 0; i < ARRAY_SIZE(symbol_conf.addr2line_style); i++) {
+		switch (symbol_conf.addr2line_style[i]) {
+		case A2L_STYLE_LIBDW:
+			ret = libdw__addr2line(addr, file, line_nr, dso, unwind_inlines,
+					       node, sym);
+			break;
+		case A2L_STYLE_LLVM:
+			ret = llvm__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines,
+					      node, sym);
+			break;
+		case A2L_STYLE_LIBBFD:
+			ret = libbfd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines,
+						node, sym);
+			break;
+		case A2L_STYLE_CMD:
+			ret = cmd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines,
+					     node, sym);
+			break;
+		case A2L_STYLE_UNKNOWN:
+		default:
+			break;
+		}
+		if (ret > 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+int addr2line_configure(const char *var, const char *value, void *cb __maybe_unused)
+{
+	static const char * const a2l_style_names[] = {
+		[A2L_STYLE_LIBDW] = "libdw",
+		[A2L_STYLE_LLVM] = "llvm",
+		[A2L_STYLE_LIBBFD] = "libbfd",
+		[A2L_STYLE_CMD] = "addr2line",
+		NULL
+	};
 
-	ret = llvm__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym);
-	if (ret > 0)
-		return ret;
+	char *s, *p, *saveptr;
+	size_t i = 0;
 
-	ret = libbfd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym);
-	if (ret > 0)
-		return ret;
+	if (strcmp(var, "addr2line.style"))
+		return 0;
 
-	return cmd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym);
+	if (!value)
+		return -1;
+
+	s = strdup(value);
+	if (!s)
+		return -1;
+
+	p = strtok_r(s, ",", &saveptr);
+	while (p && i < ARRAY_SIZE(symbol_conf.addr2line_style)) {
+		bool found = false;
+		char *q = strim(p);
+
+		for (size_t j = A2L_STYLE_LIBDW; j < MAX_A2L_STYLE; j++) {
+			if (!strcasecmp(q, a2l_style_names[j])) {
+				symbol_conf.addr2line_style[i++] = j;
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+			pr_warning("Unknown addr2line style: %s\n", q);
+		p = strtok_r(NULL, ",", &saveptr);
+	}
+
+	free(s);
+	return 0;
 }
 
 static struct inline_node *addr2inlines(const char *dso_name, u64 addr,
diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h
index c36f573cd339..7c37b3bf9ce7 100644
--- a/tools/perf/util/srcline.h
+++ b/tools/perf/util/srcline.h
@@ -57,9 +57,12 @@ struct inline_node *inlines__tree_find(struct rb_root_cached *tree, u64 addr);
 void inlines__tree_delete(struct rb_root_cached *tree);
 
 int inline_list__append(struct symbol *symbol, char *srcline, struct inline_node *node);
+int inline_list__append_tail(struct symbol *symbol, char *srcline, struct inline_node *node);
 char *srcline_from_fileline(const char *file, unsigned int line);
 struct symbol *new_inline_sym(struct dso *dso,
 			      struct symbol *base_sym,
 			      const char *funcname);
 
+int addr2line_configure(const char *var, const char *value, void *cb);
+
 #endif /* PERF_SRCLINE_H */
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 6d02f84c5691..dc2b66855f6c 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -820,12 +820,6 @@ static void printout(struct perf_stat_config *config, struct outstate *os,
 	}
 
 	if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
-		if (config->metric_only) {
-			pm(config, os, METRIC_THRESHOLD_UNKNOWN, /*format=*/NULL,
-			   /*unit=*/NULL, /*val=*/0);
-			return;
-		}
-
 		ok = false;
 
 		if (counter->supported) {
@@ -848,33 +842,32 @@ static void printout(struct perf_stat_config *config, struct outstate *os,
 		print_running(config, os, run, ena, /*before_metric=*/true);
 	}
 
-	if (ok) {
-		if (!config->metric_only && counter->default_metricgroup && !counter->default_show_events) {
-			void *from = NULL;
-
-			aggr_printout(config, os, os->evsel, os->id, os->aggr_nr);
-			/* Print out all the metricgroup with the same metric event. */
-			do {
-				int num = 0;
-
-				/* Print out the new line for the next new metricgroup. */
-				if (from) {
-					if (config->json_output)
-						new_line_json(config, (void *)os);
-					else
-						__new_line_std_csv(config, os);
-				}
-
-				print_noise(config, os, counter, noise, /*before_metric=*/true);
-				print_running(config, os, run, ena, /*before_metric=*/true);
-				from = perf_stat__print_shadow_stats_metricgroup(config, counter, aggr_idx,
-										 &num, from, &out);
-			} while (from != NULL);
-		} else {
-			perf_stat__print_shadow_stats(config, counter, aggr_idx, &out);
-		}
+	if (!config->metric_only && counter->default_metricgroup &&
+	    !counter->default_show_events) {
+		void *from = NULL;
+
+		aggr_printout(config, os, os->evsel, os->id, os->aggr_nr);
+		/* Print out all the metricgroup with the same metric event. */
+		do {
+			int num = 0;
+
+			/* Print out the new line for the next new metricgroup. */
+			if (from) {
+				if (config->json_output)
+					new_line_json(config, (void *)os);
+				else
+					__new_line_std_csv(config, os);
+			}
+
+			print_noise(config, os, counter, noise,
+				    /*before_metric=*/true);
+			print_running(config, os, run, ena,
+				      /*before_metric=*/true);
+			from = perf_stat__print_shadow_stats_metricgroup(
+				config, counter, aggr_idx, &num, from, &out);
+		} while (from != NULL);
 	} else {
-		pm(config, os, METRIC_THRESHOLD_UNKNOWN, /*format=*/NULL, /*unit=*/NULL, /*val=*/0);
+		perf_stat__print_shadow_stats(config, counter, aggr_idx, &out);
 	}
 
 	if (!config->metric_only) {
@@ -987,7 +980,7 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
 	ena = aggr->counts.ena;
 	run = aggr->counts.run;
 
-	if (perf_stat__skip_metric_event(counter, ena, run))
+	if (perf_stat__skip_metric_event(counter))
 		return;
 
 	if (val == 0 && should_skip_zero_counter(config, counter, &id))
@@ -1397,21 +1390,12 @@ static void print_header(struct perf_stat_config *config,
 		num_print_iv = 0;
 }
 
-static int get_precision(double num)
-{
-	if (num > 1)
-		return 0;
-
-	return lround(ceil(-log10(num)));
-}
-
-static void print_table(struct perf_stat_config *config,
-			FILE *output, int precision, double avg)
+static void print_table(struct perf_stat_config *config, FILE *output, double avg)
 {
 	char tmp[64];
 	int idx, indent = 0;
 
-	scnprintf(tmp, 64, " %17.*f", precision, avg);
+	scnprintf(tmp, 64, " %17.9f", avg);
 	while (tmp[indent] == ' ')
 		indent++;
 
@@ -1421,8 +1405,7 @@ static void print_table(struct perf_stat_config *config,
 		double run = (double) config->walltime_run[idx] / NSEC_PER_SEC;
 		int h, n = 1 + abs((int) (100.0 * (run - avg)/run) / 5);
 
-		fprintf(output, " %17.*f (%+.*f) ",
-			precision, run, precision, run - avg);
+		fprintf(output, " %17.9f (%+.9f) ", run, run - avg);
 
 		for (h = 0; h < n; h++)
 			fprintf(output, "#");
@@ -1462,17 +1445,11 @@ static void print_footer(struct perf_stat_config *config)
 		}
 	} else {
 		double sd = stddev_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC;
-		/*
-		 * Display at most 2 more significant
-		 * digits than the stddev inaccuracy.
-		 */
-		int precision = get_precision(sd) + 2;
 
 		if (config->walltime_run_table)
-			print_table(config, output, precision, avg);
+			print_table(config, output, avg);
 
-		fprintf(output, " %17.*f +- %.*f seconds time elapsed",
-			precision, avg, precision, sd);
+		fprintf(output, " %17.9f +- %.9f seconds time elapsed", avg, sd);
 
 		print_noise_pct(config, NULL, sd, avg, /*before_metric=*/false);
 	}
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 9c83f7d96caa..59d2cd4f2188 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -57,7 +57,6 @@ static int prepare_metric(struct perf_stat_config *config,
 		bool is_tool_time =
 			tool_pmu__is_time_event(config, metric_events[i], &tool_aggr_idx);
 		struct perf_stat_evsel *ps = metric_events[i]->stats;
-		struct perf_stat_aggr *aggr;
 		char *n;
 		double val;
 
@@ -82,8 +81,7 @@ static int prepare_metric(struct perf_stat_config *config,
 			}
 		}
 		/* Time events are always on CPU0, the first aggregation index. */
-		aggr = &ps->aggr[is_tool_time ? tool_aggr_idx : aggr_idx];
-		if (!aggr || !metric_events[i]->supported) {
+		if (!ps || !metric_events[i]->supported) {
 			/*
 			 * Not supported events will have a count of 0, which
 			 * can be confusing in a metric. Explicitly set the
@@ -93,11 +91,21 @@ static int prepare_metric(struct perf_stat_config *config,
 			val = NAN;
 			source_count = 0;
 		} else {
-			val = aggr->counts.val;
-			if (is_tool_time)
-				val *= 1e-9; /* Convert time event nanoseconds to seconds. */
-			if (!source_count)
-				source_count = evsel__source_count(metric_events[i]);
+			struct perf_stat_aggr *aggr =
+				&ps->aggr[is_tool_time ? tool_aggr_idx : aggr_idx];
+
+			if (aggr->counts.run == 0) {
+				val = NAN;
+				source_count = 0;
+			} else {
+				val = aggr->counts.val;
+				if (is_tool_time) {
+					/* Convert time event nanoseconds to seconds. */
+					val *= 1e-9;
+				}
+				if (!source_count)
+					source_count = evsel__source_count(metric_events[i]);
+			}
 		}
 		n = strdup(evsel__metric_id(metric_events[i]));
 		if (!n)
@@ -335,14 +343,10 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
  * perf_stat__skip_metric_event - Skip the evsel in the Default metricgroup,
  *				  if it's not running or not the metric event.
  */
-bool perf_stat__skip_metric_event(struct evsel *evsel,
-				  u64 ena, u64 run)
+bool perf_stat__skip_metric_event(struct evsel *evsel)
 {
 	if (!evsel->default_metricgroup)
 		return false;
 
-	if (!ena || !run)
-		return true;
-
 	return !metricgroup__lookup(&evsel->evlist->metric_events, evsel, false);
 }
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index f986911c9296..4bced233d2fc 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -163,7 +163,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 				   struct evsel *evsel,
 				   int aggr_idx,
 				   struct perf_stat_output_ctx *out);
-bool perf_stat__skip_metric_event(struct evsel *evsel, u64 ena, u64 run);
+bool perf_stat__skip_metric_event(struct evsel *evsel);
 void *perf_stat__print_shadow_stats_metricgroup(struct perf_stat_config *config,
 						struct evsel *evsel,
 						int aggr_idx,
diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c
index 8a868cbeffae..50add72575e0 100644
--- a/tools/perf/util/strlist.c
+++ b/tools/perf/util/strlist.c
@@ -12,20 +12,16 @@
 #include <linux/zalloc.h>
 
 static
-struct rb_node *strlist__node_new(struct rblist *rblist, const void *entry)
+struct rb_node *strlist__node_new(struct rblist *rblist __maybe_unused, const void *entry)
 {
 	const char *s = entry;
 	struct rb_node *rc = NULL;
-	struct strlist *strlist = container_of(rblist, struct strlist, rblist);
 	struct str_node *snode = malloc(sizeof(*snode));
 
 	if (snode != NULL) {
-		if (strlist->dupstr) {
-			s = strdup(s);
-			if (s == NULL)
-				goto out_delete;
-		}
-		snode->s = s;
+		snode->s = strdup(s);
+		if (snode->s == NULL)
+			goto out_delete;
 		rc = &snode->rb_node;
 	}
 
@@ -36,20 +32,18 @@ out_delete:
 	return NULL;
 }
 
-static void str_node__delete(struct str_node *snode, bool dupstr)
+static void str_node__delete(struct str_node *snode)
 {
-	if (dupstr)
-		zfree((char **)&snode->s);
+	zfree((char **)&snode->s);
 	free(snode);
 }
 
 static
-void strlist__node_delete(struct rblist *rblist, struct rb_node *rb_node)
+void strlist__node_delete(struct rblist *rblist __maybe_unused, struct rb_node *rb_node)
 {
-	struct strlist *slist = container_of(rblist, struct strlist, rblist);
 	struct str_node *snode = container_of(rb_node, struct str_node, rb_node);
 
-	str_node__delete(snode, slist->dupstr);
+	str_node__delete(snode);
 }
 
 static int strlist__node_cmp(struct rb_node *rb_node, const void *entry)
@@ -139,21 +133,25 @@ out:
 	return err;
 }
 
-static int strlist__parse_list(struct strlist *slist, const char *s, const char *subst_dir)
+static int strlist__parse_list(struct strlist *slist, const char *list, const char *subst_dir)
 {
-	char *sep;
+	char *sep, *s = strdup(list), *sdup = s;
 	int err;
 
+	if (s == NULL)
+		return -ENOMEM;
+
 	while ((sep = strchr(s, ',')) != NULL) {
 		*sep = '\0';
 		err = strlist__parse_list_entry(slist, s, subst_dir);
-		*sep = ',';
 		if (err != 0)
 			return err;
 		s = sep + 1;
 	}
 
-	return *s ? strlist__parse_list_entry(slist, s, subst_dir) : 0;
+	err = *s ? strlist__parse_list_entry(slist, s, subst_dir) : 0;
+	free(sdup);
+	return err;
 }
 
 struct strlist *strlist__new(const char *list, const struct strlist_config *config)
@@ -161,12 +159,10 @@ struct strlist *strlist__new(const char *list, const struct strlist_config *conf
 	struct strlist *slist = malloc(sizeof(*slist));
 
 	if (slist != NULL) {
-		bool dupstr = true;
 		bool file_only = false;
 		const char *dirname = NULL;
 
 		if (config) {
-			dupstr = !config->dont_dupstr;
 			dirname = config->dirname;
 			file_only = config->file_only;
 		}
@@ -176,7 +172,6 @@ struct strlist *strlist__new(const char *list, const struct strlist_config *conf
 		slist->rblist.node_new    = strlist__node_new;
 		slist->rblist.node_delete = strlist__node_delete;
 
-		slist->dupstr	 = dupstr;
 		slist->file_only = file_only;
 
 		if (list && strlist__parse_list(slist, list, dirname) != 0)
diff --git a/tools/perf/util/strlist.h b/tools/perf/util/strlist.h
index 7e82c71dcc42..3e9533e66ca9 100644
--- a/tools/perf/util/strlist.h
+++ b/tools/perf/util/strlist.h
@@ -14,7 +14,6 @@ struct str_node {
 
 struct strlist {
 	struct rblist rblist;
-	bool	      dupstr;
 	bool	      file_only;
 };
 
@@ -24,7 +23,6 @@ struct strlist {
  *             found
  */
 struct strlist_config {
-	bool dont_dupstr;
 	bool file_only;
 	const char *dirname;
 };
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index d1dcafa4b3b8..76912c62b6a0 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1105,14 +1105,14 @@ static Elf *read_gnu_debugdata(struct dso *dso, Elf *elf, const char *name, int
 
 	wrapped = fmemopen(scn_data->d_buf, scn_data->d_size, "r");
 	if (!wrapped) {
-		pr_debug("%s: fmemopen: %s\n", __func__, strerror(errno));
+		pr_debug("%s: fmemopen: %m\n", __func__);
 		*dso__load_errno(dso) = -errno;
 		return NULL;
 	}
 
 	temp_fd = mkstemp(temp_filename);
 	if (temp_fd < 0) {
-		pr_debug("%s: mkstemp: %s\n", __func__, strerror(errno));
+		pr_debug("%s: mkstemp: %m\n", __func__);
 		*dso__load_errno(dso) = -errno;
 		fclose(wrapped);
 		return NULL;
@@ -1173,7 +1173,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 		Elf *embedded = read_gnu_debugdata(dso, elf, name, &new_fd);
 
 		if (!embedded)
-			goto out_close;
+			goto out_elf_end;
 
 		elf_end(elf);
 		close(fd);
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 814f960fa8f8..8662001e1e25 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -104,21 +104,10 @@ static enum dso_binary_type binary_type_symtab[] = {
 
 #define DSO_BINARY_TYPE__SYMTAB_CNT ARRAY_SIZE(binary_type_symtab)
 
-static bool symbol_type__filter(char __symbol_type)
-{
-	// Since 'U' == undefined and 'u' == unique global symbol, we can't use toupper there
-	// 'N' is for debugging symbols, 'n' is a non-data, non-code, non-debug read-only section.
-	// According to 'man nm'.
-	// 'N' first seen in:
-	// ffffffff9b35d130 N __pfx__RNCINvNtNtNtCsbDUBuN8AbD4_4core4iter8adapters3map12map_try_foldjNtCs6vVzKs5jPr6_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_
-	// a seemingly Rust mangled name
-	// Ditto for '1':
-	// root@x1:~# grep ' 1 ' /proc/kallsyms
-	// ffffffffb098bc00 1 __pfx__RNCINvNtNtNtCsfwaGRd4cjqE_4core4iter8adapters3map12map_try_foldjNtCskFudTml27HW_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_
-	// ffffffffb098bc10 1 _RNCINvNtNtNtCsfwaGRd4cjqE_4core4iter8adapters3map12map_try_foldjNtCskFudTml27HW_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_
-	char symbol_type = toupper(__symbol_type);
-	return symbol_type == 'T' || symbol_type == 'W' || symbol_type == 'D' || symbol_type == 'B' ||
-	       __symbol_type == 'u' || __symbol_type == 'l' || __symbol_type == 'N' || __symbol_type == '1';
+static bool symbol_type__filter(char symbol_type)
+{
+	symbol_type = toupper(symbol_type);
+	return symbol_type == 'T' || symbol_type == 'W' || symbol_type == 'D' || symbol_type == 'B';
 }
 
 static int prefix_underscores_count(const char *str)
diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h
index 7a80d2c14d9b..71bb17372a6c 100644
--- a/tools/perf/util/symbol_conf.h
+++ b/tools/perf/util/symbol_conf.h
@@ -9,6 +9,15 @@
 struct strlist;
 struct intlist;
 
+enum a2l_style {
+	A2L_STYLE_UNKNOWN = 0,
+	A2L_STYLE_LIBDW,
+	A2L_STYLE_LLVM,
+	A2L_STYLE_LIBBFD,
+	A2L_STYLE_CMD,
+};
+#define MAX_A2L_STYLE (A2L_STYLE_CMD + 1)
+
 struct symbol_conf {
 	bool		nanosecs;
 	unsigned short	priv_size;
@@ -70,6 +79,7 @@ struct symbol_conf {
 			*col_width_list_str,
 			*bt_stop_list_str;
 	const char		*addr2line_path;
+	enum a2l_style	addr2line_style[MAX_A2L_STYLE];
 	unsigned long	time_quantum;
        struct strlist	*dso_list,
 			*comm_list,
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 2ba9fa25e00a..ef79433ebc3a 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -2529,3 +2529,199 @@ int parse_synth_opt(char *synth)
 
 	return ret;
 }
+
+static union perf_event *__synthesize_schedstat_cpu(struct io *io, __u16 version,
+						    __u64 *cpu, __u64 timestamp)
+{
+	struct perf_record_schedstat_cpu *cs;
+	union perf_event *event;
+	size_t size;
+	char ch;
+
+	size = sizeof(*cs);
+	size = PERF_ALIGN(size, sizeof(u64));
+	event = zalloc(size);
+
+	if (!event)
+		return NULL;
+
+	cs = &event->schedstat_cpu;
+	cs->header.type = PERF_RECORD_SCHEDSTAT_CPU;
+	cs->header.size = size;
+	cs->timestamp = timestamp;
+
+	if (io__get_char(io) != 'p' || io__get_char(io) != 'u')
+		goto out_cpu;
+
+	if (io__get_dec(io, (__u64 *)cpu) != ' ')
+		goto out_cpu;
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)	\
+	do {								\
+		__u64 _tmp;						\
+		ch = io__get_dec(io, &_tmp);				\
+		if (ch != ' ' && ch != '\n')				\
+			goto out_cpu;					\
+		cs->_ver._name = _tmp;					\
+	} while (0)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+#undef CPU_FIELD
+
+	cs->cpu = *cpu;
+	cs->version = version;
+
+	return event;
+out_cpu:
+	free(event);
+	return NULL;
+}
+
+static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 version,
+						       __u64 cpu, __u64 timestamp)
+{
+	struct perf_record_schedstat_domain *ds;
+	union perf_event *event = NULL;
+	__u64 d_num;
+	size_t size;
+	char ch;
+
+	if (io__get_char(io) != 'o' || io__get_char(io) != 'm' || io__get_char(io) != 'a' ||
+	    io__get_char(io) != 'i' || io__get_char(io) != 'n')
+		return NULL;
+
+	ch = io__get_dec(io, &d_num);
+	if (version >= 17) {
+		/* Skip domain name as it can be extracted from perf header */
+		while (io__get_char(io) != ' ')
+			continue;
+	}
+
+	/* Skip cpumask as it can be extracted from perf header */
+	while (io__get_char(io) != ' ')
+		continue;
+
+	size = sizeof(*ds);
+	size = PERF_ALIGN(size, sizeof(u64));
+	event = zalloc(size);
+
+	ds = &event->schedstat_domain;
+	ds->header.type = PERF_RECORD_SCHEDSTAT_DOMAIN;
+	ds->header.size = size;
+	ds->version = version;
+	ds->timestamp = timestamp;
+	ds->domain = d_num;
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)	\
+	do {								\
+		__u64 _tmp;						\
+		ch = io__get_dec(io, &_tmp);				\
+		if (ch != ' ' && ch != '\n')				\
+			goto out_domain;				\
+		ds->_ver._name = _tmp;					\
+	} while (0)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+#undef DOMAIN_FIELD
+
+	ds->cpu = cpu;
+	goto out;
+
+out_domain:
+	free(event);
+	event = NULL;
+out:
+	return event;
+}
+
+int perf_event__synthesize_schedstat(const struct perf_tool *tool,
+				     perf_event__handler_t process,
+				     struct perf_cpu_map *user_requested_cpus)
+{
+	char *line = NULL, path[PATH_MAX];
+	union perf_event *event = NULL;
+	size_t line_len = 0;
+	char bf[BUFSIZ];
+	__u64 timestamp;
+	__u64 cpu = -1;
+	__u16 version;
+	struct io io;
+	int ret = -1;
+	char ch;
+
+	snprintf(path, PATH_MAX, "%s/schedstat", procfs__mountpoint());
+	io.fd = open(path, O_RDONLY, 0);
+	if (io.fd < 0) {
+		pr_err("Failed to open %s. Possibly CONFIG_SCHEDSTAT is disabled.\n", path);
+		return -1;
+	}
+	io__init(&io, io.fd, bf, sizeof(bf));
+
+	if (io__getline(&io, &line, &line_len) < 0 || !line_len)
+		goto out;
+
+	if (!strcmp(line, "version 15\n")) {
+		version = 15;
+	} else if (!strcmp(line, "version 16\n")) {
+		version = 16;
+	} else if (!strcmp(line, "version 17\n")) {
+		version = 17;
+	} else {
+		pr_err("Unsupported %s version: %s", path, line + 8);
+		goto out_free_line;
+	}
+
+	if (io__getline(&io, &line, &line_len) < 0 || !line_len)
+		goto out_free_line;
+	timestamp = atol(line + 10);
+
+	/*
+	 * FIXME: Can be optimized a bit by not synthesizing domain samples
+	 * for filtered out cpus.
+	 */
+	for (ch = io__get_char(&io); !io.eof; ch = io__get_char(&io)) {
+		struct perf_cpu this_cpu;
+
+		if (ch == 'c') {
+			event = __synthesize_schedstat_cpu(&io, version,
+							   &cpu, timestamp);
+		} else if (ch == 'd') {
+			event = __synthesize_schedstat_domain(&io, version,
+							      cpu, timestamp);
+		}
+		if (!event)
+			goto out_free_line;
+
+		this_cpu.cpu = cpu;
+
+		if (user_requested_cpus && !perf_cpu_map__has(user_requested_cpus, this_cpu))
+			continue;
+
+		if (process(tool, event, NULL, NULL) < 0) {
+			free(event);
+			goto out_free_line;
+		}
+
+		free(event);
+	}
+
+	ret = 0;
+
+out_free_line:
+	free(line);
+out:
+	close(io.fd);
+	return ret;
+}
diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h
index f8588b6cf11a..b0edad0c3100 100644
--- a/tools/perf/util/synthetic-events.h
+++ b/tools/perf/util/synthetic-events.h
@@ -128,4 +128,7 @@ int perf_event__synthesize_for_pipe(const struct perf_tool *tool,
 				    struct perf_data *data,
 				    perf_event__handler_t process);
 
+int perf_event__synthesize_schedstat(const struct perf_tool *tool,
+				     perf_event__handler_t process,
+				     struct perf_cpu_map *user_requested_cpu);
 #endif // __PERF_SYNTHETIC_EVENTS_H
diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c
index c6a0a27b12c2..c5ce741b0744 100644
--- a/tools/perf/util/thread-stack.c
+++ b/tools/perf/util/thread-stack.c
@@ -157,10 +157,10 @@ static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
 
 	if (thread__maps(thread) && maps__machine(thread__maps(thread))) {
 		struct machine *machine = maps__machine(thread__maps(thread));
-		const char *arch = perf_env__arch(machine->env);
+		uint16_t e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL);
 
 		ts->kernel_start = machine__kernel_start(machine);
-		if (!strcmp(arch, "x86"))
+		if (e_machine == EM_X86_64 || e_machine == EM_386)
 			ts->rstate = X86_RETPOLINE_POSSIBLE;
 	} else {
 		ts->kernel_start = 1ULL << 63;
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index aa9c58bbf9d3..22be77225bb0 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -449,7 +449,7 @@ void thread__find_cpumode_addr_location(struct thread *thread, u64 addr,
 	}
 }
 
-static uint16_t read_proc_e_machine_for_pid(pid_t pid)
+static uint16_t read_proc_e_machine_for_pid(pid_t pid, uint32_t *e_flags)
 {
 	char path[6 /* "/proc/" */ + 11 /* max length of pid */ + 5 /* "/exe\0" */];
 	int fd;
@@ -458,52 +458,71 @@ static uint16_t read_proc_e_machine_for_pid(pid_t pid)
 	snprintf(path, sizeof(path), "/proc/%d/exe", pid);
 	fd = open(path, O_RDONLY);
 	if (fd >= 0) {
-		_Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset");
-		_Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset");
-		if (pread(fd, &e_machine, sizeof(e_machine), 18) != sizeof(e_machine))
-			e_machine = EM_NONE;
+		e_machine = dso__read_e_machine(/*optional_dso=*/NULL, fd, e_flags);
 		close(fd);
 	}
 	return e_machine;
 }
 
-static int thread__e_machine_callback(struct map *map, void *machine)
+struct thread__e_machine_callback_args {
+	struct machine *machine;
+	uint32_t e_flags;
+	uint16_t e_machine;
+};
+
+static int thread__e_machine_callback(struct map *map, void *_args)
 {
+	struct thread__e_machine_callback_args *args = _args;
 	struct dso *dso = map__dso(map);
 
-	_Static_assert(0 == EM_NONE, "Unexpected EM_NONE");
 	if (!dso)
-		return EM_NONE;
+		return 0; // No dso, continue search.
 
-	return dso__e_machine(dso, machine);
+	args->e_machine = dso__e_machine(dso, args->machine, &args->e_flags);
+	return args->e_machine != EM_NONE ? 1 /* stop search */ : 0 /* continue search */;
 }
 
-uint16_t thread__e_machine(struct thread *thread, struct machine *machine)
+uint16_t thread__e_machine(struct thread *thread, struct machine *machine, uint32_t *e_flags)
 {
 	pid_t tid, pid;
 	uint16_t e_machine = RC_CHK_ACCESS(thread)->e_machine;
+	uint32_t local_e_flags = 0;
+	struct thread__e_machine_callback_args args = {
+		.machine = machine,
+		.e_flags = 0,
+		.e_machine = EM_NONE,
+	};
 
-	if (e_machine != EM_NONE)
+	if (e_machine != EM_NONE) {
+		if (e_flags)
+			*e_flags = thread__e_flags(thread);
 		return e_machine;
+	}
+
+	if (machine == NULL) {
+		struct maps *maps = thread__maps(thread);
 
+		machine = maps__machine(maps);
+	}
 	tid = thread__tid(thread);
 	pid = thread__pid(thread);
 	if (pid != tid) {
 		struct thread *parent = machine__findnew_thread(machine, pid, pid);
 
 		if (parent) {
-			e_machine = thread__e_machine(parent, machine);
+			e_machine = thread__e_machine(parent, machine, &local_e_flags);
 			thread__put(parent);
-			thread__set_e_machine(thread, e_machine);
-			return e_machine;
+			goto out;
 		}
 		/* Something went wrong, fallback. */
 	}
 	/* Reading on the PID thread. First try to find from the maps. */
-	e_machine = maps__for_each_map(thread__maps(thread),
-				       thread__e_machine_callback,
-				       machine);
-	if (e_machine == EM_NONE) {
+	maps__for_each_map(thread__maps(thread), thread__e_machine_callback, &args);
+
+	if (args.e_machine != EM_NONE) {
+		e_machine = args.e_machine;
+		local_e_flags = args.e_flags;
+	} else {
 		/* Maps failed, perhaps we're live with map events disabled. */
 		bool is_live = machine->machines == NULL;
 
@@ -517,12 +536,18 @@ uint16_t thread__e_machine(struct thread *thread, struct machine *machine)
 		}
 		/* Read from /proc/pid/exe if live. */
 		if (is_live)
-			e_machine = read_proc_e_machine_for_pid(pid);
+			e_machine = read_proc_e_machine_for_pid(pid, &local_e_flags);
 	}
-	if (e_machine != EM_NONE)
+out:
+	if (e_machine != EM_NONE) {
 		thread__set_e_machine(thread, e_machine);
-	else
+		thread__set_e_flags(thread, local_e_flags);
+	} else {
 		e_machine = EM_HOST;
+		local_e_flags = EF_HOST;
+	}
+	if (e_flags)
+		*e_flags = local_e_flags;
 	return e_machine;
 }
 
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 310eaea344bb..f5792d3e8a16 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -61,6 +61,10 @@ DECLARE_RC_STRUCT(thread) {
 	bool			filter;
 	int			filter_entry_depth;
 	/**
+	 * @e_flags: The ELF EF_* associated with the thread. Valid if e_machine != EM_NONE.
+	 */
+	uint16_t		e_flags;
+	/**
 	 * @e_machine: The ELF EM_* associated with the thread. EM_NONE if not
 	 * computed.
 	 */
@@ -307,13 +311,23 @@ static inline void thread__set_filter_entry_depth(struct thread *thread, int dep
 	RC_CHK_ACCESS(thread)->filter_entry_depth = depth;
 }
 
-uint16_t thread__e_machine(struct thread *thread, struct machine *machine);
+uint16_t thread__e_machine(struct thread *thread, struct machine *machine, uint32_t *e_flags);
 
 static inline void thread__set_e_machine(struct thread *thread, uint16_t e_machine)
 {
 	RC_CHK_ACCESS(thread)->e_machine = e_machine;
 }
 
+static inline uint32_t thread__e_flags(const struct thread *thread)
+{
+	return RC_CHK_ACCESS(thread)->e_flags;
+}
+
+static inline void thread__set_e_flags(struct thread *thread, uint32_t e_flags)
+{
+	RC_CHK_ACCESS(thread)->e_flags = e_flags;
+}
+
 
 static inline bool thread__lbr_stitch_enable(const struct thread *thread)
 {
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c
index ca193c1374ed..48c70f149e92 100644
--- a/tools/perf/util/thread_map.c
+++ b/tools/perf/util/thread_map.c
@@ -164,19 +164,16 @@ static struct perf_thread_map *thread_map__new_by_pid_str(const char *pid_str)
 	struct dirent **namelist = NULL;
 	int i, j = 0;
 	pid_t pid, prev_pid = INT_MAX;
-	char *end_ptr;
 	struct str_node *pos;
-	struct strlist_config slist_config = { .dont_dupstr = true, };
-	struct strlist *slist = strlist__new(pid_str, &slist_config);
+	struct strlist *slist = strlist__new(pid_str, NULL);
 
 	if (!slist)
 		return NULL;
 
 	strlist__for_each_entry(pos, slist) {
-		pid = strtol(pos->s, &end_ptr, 10);
+		pid = strtol(pos->s, NULL, 10);
 
-		if (pid == INT_MIN || pid == INT_MAX ||
-		    (*end_ptr != '\0' && *end_ptr != ','))
+		if (pid == INT_MIN || pid == INT_MAX)
 			goto out_free_threads;
 
 		if (pid == prev_pid)
@@ -223,24 +220,21 @@ struct perf_thread_map *thread_map__new_by_tid_str(const char *tid_str)
 	struct perf_thread_map *threads = NULL, *nt;
 	int ntasks = 0;
 	pid_t tid, prev_tid = INT_MAX;
-	char *end_ptr;
 	struct str_node *pos;
-	struct strlist_config slist_config = { .dont_dupstr = true, };
 	struct strlist *slist;
 
 	/* perf-stat expects threads to be generated even if tid not given */
 	if (!tid_str)
 		return perf_thread_map__new_dummy();
 
-	slist = strlist__new(tid_str, &slist_config);
+	slist = strlist__new(tid_str, NULL);
 	if (!slist)
 		return NULL;
 
 	strlist__for_each_entry(pos, slist) {
-		tid = strtol(pos->s, &end_ptr, 10);
+		tid = strtol(pos->s, NULL, 10);
 
-		if (tid == INT_MIN || tid == INT_MAX ||
-		    (*end_ptr != '\0' && *end_ptr != ','))
+		if (tid == INT_MIN || tid == INT_MAX)
 			goto out_free_threads;
 
 		if (tid == prev_tid)
diff --git a/tools/perf/util/time-utils.c b/tools/perf/util/time-utils.c
index 1b91ccd4d523..d43c4577d7eb 100644
--- a/tools/perf/util/time-utils.c
+++ b/tools/perf/util/time-utils.c
@@ -325,7 +325,7 @@ static int percent_comma_split(struct perf_time_interval *ptime_buf, int num,
 }
 
 static int one_percent_convert(struct perf_time_interval *ptime_buf,
-			       const char *ostr, u64 start, u64 end, char *c)
+			       const char *ostr, u64 start, u64 end, const char *c)
 {
 	char *str;
 	int len = strlen(ostr), ret;
@@ -358,7 +358,7 @@ static int one_percent_convert(struct perf_time_interval *ptime_buf,
 int perf_time__percent_parse_str(struct perf_time_interval *ptime_buf, int num,
 				 const char *ostr, u64 start, u64 end)
 {
-	char *c;
+	const char *c;
 
 	/*
 	 * ostr example:
diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c
index 27ba5849c74a..013c7839e2cf 100644
--- a/tools/perf/util/tool.c
+++ b/tools/perf/util/tool.c
@@ -253,7 +253,25 @@ static int perf_event__process_bpf_metadata_stub(const struct perf_tool *tool __
 {
 	if (dump_trace)
 		perf_event__fprintf_bpf_metadata(event, stdout);
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+static int process_schedstat_cpu_stub(const struct perf_tool *tool __maybe_unused,
+				      struct perf_session *perf_session __maybe_unused,
+				      union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_schedstat_cpu(event, stdout);
+	dump_printf(": unhandled!\n");
+	return 0;
+}
 
+static int process_schedstat_domain_stub(const struct perf_tool *tool __maybe_unused,
+					 struct perf_session *perf_session __maybe_unused,
+					 union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_schedstat_domain(event, stdout);
 	dump_printf(": unhandled!\n");
 	return 0;
 }
@@ -317,6 +335,8 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
 #endif
 	tool->finished_init = process_event_op2_stub;
 	tool->bpf_metadata = perf_event__process_bpf_metadata_stub;
+	tool->schedstat_cpu = process_schedstat_cpu_stub;
+	tool->schedstat_domain = process_schedstat_domain_stub;
 }
 
 bool perf_tool__compressed_is_stub(const struct perf_tool *tool)
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index e96b69d25a5b..2d9a4b1ca9d0 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -81,7 +81,9 @@ struct perf_tool {
 			stat_round,
 			feature,
 			finished_init,
-			bpf_metadata;
+			bpf_metadata,
+			schedstat_cpu,
+			schedstat_domain;
 	event_op4	compressed;
 	event_op3	auxtrace;
 	bool		ordered_events;
diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c
index 37c4eae0bef1..6a9df3dc0e07 100644
--- a/tools/perf/util/tool_pmu.c
+++ b/tools/perf/util/tool_pmu.c
@@ -2,7 +2,6 @@
 #include "cgroup.h"
 #include "counts.h"
 #include "cputopo.h"
-#include "debug.h"
 #include "evsel.h"
 #include "pmu.h"
 #include "print-events.h"
@@ -14,7 +13,6 @@
 #include <api/fs/fs.h>
 #include <api/io.h>
 #include <internal/threadmap.h>
-#include <perf/cpumap.h>
 #include <perf/threadmap.h>
 #include <fcntl.h>
 #include <strings.h>
@@ -111,23 +109,6 @@ const char *evsel__tool_pmu_event_name(const struct evsel *evsel)
 	return tool_pmu__event_to_str(evsel->core.attr.config);
 }
 
-struct perf_cpu_map *tool_pmu__cpus(struct perf_event_attr *attr)
-{
-	static struct perf_cpu_map *cpu0_map;
-	enum tool_pmu_event event = (enum tool_pmu_event)attr->config;
-
-	if (event <= TOOL_PMU__EVENT_NONE || event >= TOOL_PMU__EVENT_MAX) {
-		pr_err("Invalid tool PMU event config %llx\n", attr->config);
-		return NULL;
-	}
-	if (event == TOOL_PMU__EVENT_USER_TIME || event == TOOL_PMU__EVENT_SYSTEM_TIME)
-		return cpu_map__online();
-
-	if (!cpu0_map)
-		cpu0_map = perf_cpu_map__new_int(0);
-	return perf_cpu_map__get(cpu0_map);
-}
-
 static bool read_until_char(struct io *io, char e)
 {
 	int c;
diff --git a/tools/perf/util/tool_pmu.h b/tools/perf/util/tool_pmu.h
index ea343d1983d3..f1714001bc1d 100644
--- a/tools/perf/util/tool_pmu.h
+++ b/tools/perf/util/tool_pmu.h
@@ -46,7 +46,6 @@ bool tool_pmu__read_event(enum tool_pmu_event ev,
 u64 tool_pmu__cpu_slots_per_cycle(void);
 
 bool perf_pmu__is_tool(const struct perf_pmu *pmu);
-struct perf_cpu_map *tool_pmu__cpus(struct perf_event_attr *attr);
 
 bool evsel__is_tool(const struct evsel *evsel);
 enum tool_pmu_event evsel__tool_event(const struct evsel *evsel);
diff --git a/tools/perf/util/tp_pmu.c b/tools/perf/util/tp_pmu.c
index eddb9807131a..c2be8c9f9084 100644
--- a/tools/perf/util/tp_pmu.c
+++ b/tools/perf/util/tp_pmu.c
@@ -192,7 +192,7 @@ bool tp_pmu__have_event(struct perf_pmu *pmu __maybe_unused, const char *name)
 	char *dup_name, *colon;
 	int id;
 
-	colon = strchr(name, ':');
+	colon = strchr((char *)name, ':');
 	if (colon == NULL)
 		return false;
 
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index c8755679281e..45774722f249 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -482,7 +482,7 @@ char *tracepoint_id_to_name(u64 config)
 static struct tracepoint_path *tracepoint_name_to_path(const char *name)
 {
 	struct tracepoint_path *path = zalloc(sizeof(*path));
-	char *str = strchr(name, ':');
+	const char *str = strchr(name, ':');
 
 	if (path == NULL || str == NULL) {
 		free(path);
diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c
index 72abb28b7b5a..fa850e44cb46 100644
--- a/tools/perf/util/trace-event-scripting.c
+++ b/tools/perf/util/trace-event-scripting.c
@@ -13,7 +13,6 @@
 #include <event-parse.h>
 #endif
 
-#include "archinsn.h"
 #include "debug.h"
 #include "event.h"
 #include "trace-event.h"
@@ -274,21 +273,6 @@ void setup_perl_scripting(void)
 #endif
 #endif
 
-#if !defined(__i386__) && !defined(__x86_64__)
-void arch_fetch_insn(struct perf_sample *sample __maybe_unused,
-		     struct thread *thread __maybe_unused,
-		     struct machine *machine __maybe_unused)
-{
-}
-#endif
-
-void script_fetch_insn(struct perf_sample *sample, struct thread *thread,
-		       struct machine *machine, bool native_arch)
-{
-	if (sample->insn_len == 0 && native_arch)
-		arch_fetch_insn(sample, thread, machine);
-}
-
 static const struct {
 	u32 flags;
 	const char *name;
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 71e680bc3d4b..914d9b69ed62 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -116,9 +116,6 @@ extern unsigned int scripting_max_stack;
 struct scripting_ops *script_spec__lookup(const char *spec);
 int script_spec__for_each(int (*cb)(struct scripting_ops *ops, const char *spec));
 
-void script_fetch_insn(struct perf_sample *sample, struct thread *thread,
-		       struct machine *machine, bool native_arch);
-
 void setup_perl_scripting(void);
 void setup_python_scripting(void);
 
diff --git a/tools/perf/util/units.c b/tools/perf/util/units.c
index 4c6a86e1cb54..0bbacf5a29aa 100644
--- a/tools/perf/util/units.c
+++ b/tools/perf/util/units.c
@@ -12,7 +12,7 @@ unsigned long parse_tag_value(const char *str, struct parse_tag *tags)
 	struct parse_tag *i = tags;
 
 	while (i->tag) {
-		char *s = strchr(str, i->tag);
+		const char *s = strchr(str, i->tag);
 
 		if (s) {
 			unsigned long int value;
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index ae70fb56a057..05e8e68bd49c 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -6,6 +6,7 @@
 #include <errno.h>
 #include "debug.h"
 #include "dso.h"
+#include <dwarf-regs.h>
 #include "unwind.h"
 #include "unwind-libdw.h"
 #include "machine.h"
@@ -19,6 +20,17 @@
 #include "callchain.h"
 #include "util/env.h"
 
+/*
+ * The dwfl thread argument passed to functions like memory_read. Memory has to
+ * be allocated to persist of multiple uses of the dwfl.
+ */
+struct dwfl_ui_thread_info {
+	/* Back link to the dwfl. */
+	Dwfl *dwfl;
+	/* The current unwind info, only 1 is supported. */
+	struct unwind_info *ui;
+};
+
 static char *debuginfo_path;
 
 static int __find_debuginfo(Dwfl_Module *mod __maybe_unused, void **userdata,
@@ -34,6 +46,19 @@ static int __find_debuginfo(Dwfl_Module *mod __maybe_unused, void **userdata,
 	return -1;
 }
 
+void libdw__invalidate_dwfl(struct maps *maps, void *arg)
+{
+	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
+
+	if (!dwfl_ui_ti)
+		return;
+
+	assert(dwfl_ui_ti->ui == NULL);
+	maps__set_libdw_addr_space_dwfl(maps, NULL);
+	dwfl_end(dwfl_ui_ti->dwfl);
+	free(dwfl_ui_ti);
+}
+
 static const Dwfl_Callbacks offline_callbacks = {
 	.find_debuginfo		= __find_debuginfo,
 	.debuginfo_path		= &debuginfo_path,
@@ -136,8 +161,8 @@ static int entry(u64 ip, struct unwind_info *ui)
 	}
 
 	e->ip	  = ip;
-	e->ms.maps = al.maps;
-	e->ms.map = al.map;
+	e->ms.thread = thread__get(al.thread);
+	e->ms.map = map__get(al.map);
 	e->ms.sym = al.sym;
 
 	pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
@@ -186,8 +211,8 @@ out_fail:
 static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *result,
 			void *arg)
 {
-	struct unwind_info *ui = arg;
-	const char *arch = perf_env__arch(ui->machine->env);
+	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
+	struct unwind_info *ui = dwfl_ui_ti->ui;
 	struct stack_dump *stack = &ui->sample->user_stack;
 	u64 start, end;
 	int offset;
@@ -197,7 +222,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 		return false;
 
 	ret = perf_reg_value(&start, ui->sample->user_regs,
-			     perf_arch_reg_sp(arch));
+			     perf_arch_reg_sp(ui->e_machine));
 	if (ret)
 		return false;
 
@@ -225,10 +250,64 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 	return true;
 }
 
+static bool libdw_set_initial_registers(Dwfl_Thread *thread, void *arg)
+{
+	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
+	struct unwind_info *ui = dwfl_ui_ti->ui;
+	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
+	Dwarf_Word *dwarf_regs;
+	int max_dwarf_reg = 0;
+	bool ret;
+	uint16_t e_machine = ui->e_machine;
+	int e_flags = ui->e_flags;
+	uint64_t ip_perf_reg = perf_arch_reg_ip(e_machine);
+	Dwarf_Word val = 0;
+
+
+	/*
+	 * For every possible perf register in the bitmap determine the dwarf
+	 * register and use to compute the max.
+	 */
+	for (int perf_reg = 0; perf_reg < 64; perf_reg++) {
+		if (user_regs->mask & (1ULL << perf_reg)) {
+			int dwarf_reg =
+				get_dwarf_regnum_for_perf_regnum(perf_reg, e_machine,
+								 e_flags,
+								 /*only_libdw_supported=*/true);
+			if (dwarf_reg > max_dwarf_reg)
+				max_dwarf_reg = dwarf_reg;
+		}
+	}
+
+	dwarf_regs = calloc(max_dwarf_reg + 1, sizeof(*dwarf_regs));
+	if (!dwarf_regs)
+		return false;
+
+	for (int perf_reg = 0; perf_reg < 64; perf_reg++) {
+		if (user_regs->mask & (1ULL << perf_reg)) {
+			int dwarf_reg =
+				get_dwarf_regnum_for_perf_regnum(perf_reg, e_machine,
+								 e_flags,
+								 /*only_libdw_supported=*/true);
+			if (dwarf_reg >= 0) {
+				val = 0;
+				if (perf_reg_value(&val, user_regs, perf_reg) == 0)
+					dwarf_regs[dwarf_reg] = val;
+			}
+		}
+	}
+	if (perf_reg_value(&val, user_regs, ip_perf_reg) == 0)
+		dwfl_thread_state_register_pc(thread, val);
+
+	ret = dwfl_thread_state_registers(thread, 0, max_dwarf_reg + 1, dwarf_regs);
+	free(dwarf_regs);
+	return ret;
+}
+
 static const Dwfl_Thread_Callbacks callbacks = {
-	.next_thread		= next_thread,
-	.memory_read		= memory_read,
-	.set_initial_registers	= libdw__arch_set_initial_registers,
+	.next_thread           = next_thread,
+	.memory_read           = memory_read,
+	.set_initial_registers = libdw_set_initial_registers,
 };
 
 static int
@@ -266,33 +345,54 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 			int max_stack,
 			bool best_effort)
 {
-	struct unwind_info *ui, ui_buf = {
-		.sample		= data,
-		.thread		= thread,
-		.machine	= maps__machine((thread__maps(thread))),
-		.cb		= cb,
-		.arg		= arg,
-		.max_stack	= max_stack,
-		.best_effort    = best_effort
-	};
-	const char *arch = perf_env__arch(ui_buf.machine->env);
+	struct maps *maps = thread__maps(thread);
+	struct machine *machine = maps__machine(maps);
+	uint32_t e_flags = 0;
+	uint16_t e_machine = thread__e_machine(thread, machine, &e_flags);
+	struct dwfl_ui_thread_info *dwfl_ui_ti;
+	static struct unwind_info *ui;
+	Dwfl *dwfl;
 	Dwarf_Word ip;
 	int err = -EINVAL, i;
 
 	if (!data->user_regs || !data->user_regs->regs)
 		return -EINVAL;
 
-	ui = zalloc(sizeof(ui_buf) + sizeof(ui_buf.entries[0]) * max_stack);
+	ui = zalloc(sizeof(*ui) + sizeof(ui->entries[0]) * max_stack);
 	if (!ui)
 		return -ENOMEM;
 
-	*ui = ui_buf;
+	*ui = (struct unwind_info){
+		.sample		= data,
+		.thread		= thread,
+		.machine	= machine,
+		.cb		= cb,
+		.arg		= arg,
+		.max_stack	= max_stack,
+		.e_machine	= e_machine,
+		.e_flags	= e_flags,
+		.best_effort    = best_effort
+	};
 
-	ui->dwfl = dwfl_begin(&offline_callbacks);
-	if (!ui->dwfl)
-		goto out;
+	dwfl_ui_ti = maps__libdw_addr_space_dwfl(maps);
+	if (dwfl_ui_ti) {
+		dwfl = dwfl_ui_ti->dwfl;
+	} else {
+		dwfl_ui_ti = zalloc(sizeof(*dwfl_ui_ti));
+		dwfl = dwfl_begin(&offline_callbacks);
+		if (!dwfl)
+			goto out;
+
+		dwfl_ui_ti->dwfl = dwfl;
+		maps__set_libdw_addr_space_dwfl(maps, dwfl_ui_ti);
+	}
+	assert(dwfl_ui_ti->ui == NULL);
+	assert(dwfl_ui_ti->dwfl == dwfl);
+	assert(dwfl_ui_ti == maps__libdw_addr_space_dwfl(maps));
+	dwfl_ui_ti->ui = ui;
+	ui->dwfl = dwfl;
 
-	err = perf_reg_value(&ip, data->user_regs, perf_arch_reg_ip(arch));
+	err = perf_reg_value(&ip, data->user_regs, perf_arch_reg_ip(e_machine));
 	if (err)
 		goto out;
 
@@ -300,11 +400,12 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	if (err)
 		goto out;
 
-	err = !dwfl_attach_state(ui->dwfl, EM_NONE, thread__tid(thread), &callbacks, ui);
-	if (err)
-		goto out;
+	dwfl_attach_state(dwfl, /*elf=*/NULL, thread__tid(thread), &callbacks,
+			  /* Dwfl thread function argument*/dwfl_ui_ti);
+	// Ignore thread already attached error.
 
-	err = dwfl_getthread_frames(ui->dwfl, thread__tid(thread), frame_callback, ui);
+	err = dwfl_getthread_frames(dwfl, thread__tid(thread), frame_callback,
+				    /* Dwfl frame function argument*/ui);
 
 	if (err && ui->max_stack != max_stack)
 		err = 0;
@@ -325,7 +426,10 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	if (err)
 		pr_debug("unwind: failed with '%s'\n", dwfl_errmsg(-1));
 
-	dwfl_end(ui->dwfl);
+	for (i = 0; i < ui->idx; i++)
+		map_symbol__exit(&ui->entries[i].ms);
+
+	dwfl_ui_ti->ui = NULL;
 	free(ui);
 	return 0;
 }
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 8c88bc4f2304..6423bf5a2492 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -2,17 +2,17 @@
 #ifndef __PERF_UNWIND_LIBDW_H
 #define __PERF_UNWIND_LIBDW_H
 
-#include <elfutils/libdwfl.h>
+#include <stdint.h>
 #include "unwind.h"
 
 struct machine;
 struct perf_sample;
 struct thread;
 
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg);
+#ifdef HAVE_LIBDW_SUPPORT
 
 struct unwind_info {
-	Dwfl			*dwfl;
+	void			*dwfl;
 	struct perf_sample      *sample;
 	struct machine          *machine;
 	struct thread           *thread;
@@ -20,8 +20,13 @@ struct unwind_info {
 	void			*arg;
 	int			max_stack;
 	int			idx;
+	uint32_t		e_flags;
+	uint16_t		e_machine;
 	bool			best_effort;
 	struct unwind_entry	entries[];
 };
 
+void libdw__invalidate_dwfl(struct maps *maps, void *dwfl);
+#endif
+
 #endif /* __PERF_UNWIND_LIBDW_H */
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index 0b037e7389a0..5b39ce21e333 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -572,7 +572,6 @@ static int access_mem(unw_addr_space_t __maybe_unused as,
 		      int __write, void *arg)
 {
 	struct unwind_info *ui = arg;
-	const char *arch = perf_env__arch(ui->machine->env);
 	struct stack_dump *stack = &ui->sample->user_stack;
 	u64 start, end;
 	int offset;
@@ -585,7 +584,9 @@ static int access_mem(unw_addr_space_t __maybe_unused as,
 	}
 
 	ret = perf_reg_value(&start, perf_sample__user_regs(ui->sample),
-			     perf_arch_reg_sp(arch));
+			     perf_arch_reg_sp(thread__e_machine(ui->thread,
+								ui->machine,
+								/*e_flags=*/NULL)));
 	if (ret)
 		return ret;
 
@@ -667,7 +668,7 @@ static int entry(u64 ip, struct thread *thread,
 	e.ms.sym = thread__find_symbol(thread, PERF_RECORD_MISC_USER, ip, &al);
 	e.ip     = ip;
 	e.ms.map = al.map;
-	e.ms.maps = al.maps;
+	e.ms.thread = thread__get(al.thread);
 
 	pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
 		 al.sym ? al.sym->name : "''",
@@ -734,7 +735,7 @@ static void _unwind__finish_access(struct maps *maps)
 static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
 		       void *arg, int max_stack)
 {
-	const char *arch = perf_env__arch(ui->machine->env);
+	uint16_t e_machine = thread__e_machine(ui->thread, ui->machine, /*e_flags=*/NULL);
 	u64 val;
 	unw_word_t ips[max_stack];
 	unw_addr_space_t addr_space;
@@ -742,7 +743,7 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
 	int ret, i = 0;
 
 	ret = perf_reg_value(&val, perf_sample__user_regs(ui->sample),
-			     perf_arch_reg_ip(arch));
+			     perf_arch_reg_ip(e_machine));
 	if (ret)
 		return ret;
 
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 0f031eb80b4c..8b893de35f77 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include "perf.h"
 #include "util.h"
 #include "debug.h"
 #include "event.h"
@@ -257,6 +258,54 @@ static int rm_rf_kcore_dir(const char *path)
 	return 0;
 }
 
+void cpumask_to_cpulist(char *cpumask, char *cpulist)
+{
+	int i, j, bm_size, nbits;
+	int len = strlen(cpumask);
+	unsigned long *bm;
+	char cpus[MAX_NR_CPUS];
+
+	for (i = 0; i < len; i++) {
+		if (cpumask[i] == ',') {
+			for (j = i; j < len; j++)
+				cpumask[j] = cpumask[j + 1];
+		}
+	}
+
+	len = strlen(cpumask);
+	bm_size = (len + 15) / 16;
+	nbits = bm_size * 64;
+	if (nbits <= 0)
+		return;
+
+	bm = calloc(bm_size, sizeof(unsigned long));
+	if (!bm)
+		goto free_bm;
+
+	for (i = 0; i < bm_size; i++) {
+		char blk[17];
+		int blklen = len > 16 ? 16 : len;
+
+		strncpy(blk, cpumask + len - blklen, blklen);
+		blk[blklen] = '\0';
+		bm[i] = strtoul(blk, NULL, 16);
+		cpumask[len - blklen] = '\0';
+		len = strlen(cpumask);
+	}
+
+	bitmap_scnprintf(bm, nbits, cpus, sizeof(cpus));
+	strcpy(cpulist, cpus);
+
+free_bm:
+	free(bm);
+}
+
+void print_separator2(int pre_dash_cnt, const char *s, int post_dash_cnt)
+{
+	printf("%.*s%s%.*s\n", pre_dash_cnt, graph_dotted_line, s, post_dash_cnt,
+	       graph_dotted_line);
+}
+
 int rm_rf_perf_data(const char *path)
 {
 	const char *pat[] = {
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 3423778e39a5..394dbfa944ac 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -11,6 +11,7 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <linux/compiler.h>
+#include <linux/bitmap.h>
 #include <sys/types.h>
 #ifndef __cplusplus
 #include <internal/cpumap.h>
@@ -48,6 +49,10 @@ bool sysctl__nmi_watchdog_enabled(void);
 
 int perf_tip(char **strp, const char *dirpath);
 
+void cpumask_to_cpulist(char *cpumask, char *cpulist);
+
+void print_separator2(int pre_dash_cnt, const char *s, int post_dash_cnt);
+
 #ifndef HAVE_SCHED_GETCPU_SUPPORT
 int sched_getcpu(void);
 #endif
diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
index ded48263dd5e..b5ecf137febc 100644
--- a/tools/scripts/Makefile.include
+++ b/tools/scripts/Makefile.include
@@ -94,6 +94,8 @@ LLVM_STRIP	?= llvm-strip
 # Some tools require bpftool
 SYSTEM_BPFTOOL	?= bpftool
 
+RUSTC		?= rustc
+
 ifeq ($(CC_NO_CLANG), 1)
 EXTRA_WARNINGS += -Wstrict-aliasing=3
author	Linus Torvalds <torvalds@linux-foundation.org>	2026-02-21 10:51:08 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-02-21 10:51:08 -0800
commit	c7decec2f2d2ab0366567f9e30c0e1418cece43f (patch)
tree	50312739ad43d0655ea71c942d848db2ff123e8e /tools
parent	3544d5ce36f403db6e5c994f526101c870ffe9fe (diff)
parent	dbf0108347bdb5d4ccef8910555b16c1f1a505f8 (diff)