From 3caeaa562733c4836e61086ec07666635006a787 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 7 Dec 2015 12:25:02 +0530 Subject: perf kvm record/report: 'unprocessable sample' error while recording/reporting guest data While recording guest samples in host using perf kvm record, it will populate unprocessable sample error, though samples will be recorded properly. While generating report using perf kvm report, no samples will be processed and same error will populate. We have seen this behaviour with upstream perf(4.4-rc3) on x86 and ppc64 hardware. Reason behind this failure is, when it tries to fetch machine from rb_tree of machines, it fails. As a part of tracing a bug, we figured out that this code was incorrectly refactored in commit 54245fdc3576 ("perf session: Remove wrappers to machines__find"). This patch will change the functionality such that if it can't fetch machine in first trial, it will create one node of machine and add that to rb_tree. So next time when it tries to fetch same machine from rb_tree, it won't fail. Actually it was the case before refactoring of code in aforementioned commit. This patch is generated from acme perf/core branch. Below I've mention an example that demonstrate the behaviour before and after applying patch. Before applying patch: [Note: One needs to run guest before recording data in host] ravi@ravi-bangoria:~$ ./perf kvm record -a Warning: 5903 unprocessable samples recorded. Do you have a KVM guest running and not using 'perf kvm'? [ perf record: Captured and wrote 1.409 MB perf.data.guest (285 samples) ] ravi@ravi-bangoria:~$ ./perf kvm report --stdio Warning: 5903 unprocessable samples recorded. Do you have a KVM guest running and not using 'perf kvm'? # To display the perf.data header info, please use --header/--header-only options. # # Total Lost Samples: 0 # # Samples: 285 of event 'cycles' # Event count (approx.): 88715406 # # Overhead Command Shared Object Symbol # ........ ....... ............. ...... # # (For a higher level overview, try: perf report --sort comm,dso) # After applying patch: ravi@ravi-bangoria:~$ ./perf kvm record -a [ perf record: Captured and wrote 1.188 MB perf.data.guest (17 samples) ] ravi@ravi-bangoria:~$ ./perf kvm report --stdio # To display the perf.data header info, please use --header/--header-only options. # # Total Lost Samples: 0 # # Samples: 17 of event 'cycles' # Event count (approx.): 700746 # # Overhead Command Shared Object Symbol # ........ ....... ................ ...................... # 34.19% :5758 [unknown] [g] 0xffffffff818682ab 22.79% :5758 [unknown] [g] 0xffffffff812dc7f8 22.79% :5758 [unknown] [g] 0xffffffff818650d0 14.83% :5758 [unknown] [g] 0xffffffff8161a1b6 2.49% :5758 [unknown] [g] 0xffffffff818692bf 0.48% :5758 [unknown] [g] 0xffffffff81869253 0.05% :5758 [unknown] [g] 0xffffffff81869250 Signed-off-by: Ravi Bangoria Cc: Naveen N. Rao Cc: stable@vger.kernel.org # v3.19+ Fixes: 54245fdc3576 ("perf session: Remove wrappers to machines__find") Link: http://lkml.kernel.org/r/1449471302-11283-1-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index d5636ba94b20..40b7a0d0905b 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1149,7 +1149,7 @@ static struct machine *machines__find_for_cpumode(struct machines *machines, machine = machines__find(machines, pid); if (!machine) - machine = machines__find(machines, DEFAULT_GUEST_KERNEL_ID); + machine = machines__findnew(machines, DEFAULT_GUEST_KERNEL_ID); return machine; } -- cgit v1.2.3 From 40c4a0f92aed570cc529a1e5c24c7e04a0ce8b85 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 13 Jan 2016 17:23:01 +0000 Subject: perf symbols: Fix reading of build-id from vDSO We need to use the long name (the filename) when reading the build-id from a DSO. Using the short name doesn't work for (at least) vDSOs. Signed-off-by: Ben Hutchings Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20160113172301.GT28542@decadent.org.uk Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 3b2de6eb3376..ab02209a7cf3 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1466,7 +1466,7 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) * Read the build id if possible. This is required for * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work */ - if (filename__read_build_id(dso->name, build_id, BUILD_ID_SIZE) > 0) + if (filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0) dso__set_build_id(dso, build_id); /* -- cgit v1.2.3 From 7be43dfb1e617b87bf2d936d82c026be39b43910 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 13 Jan 2016 12:17:14 +0000 Subject: perf build: Set parallel making options build-test 'make build-test' is painful because of time consuming. In a full test, all test cases are built twice with tools/perf/Makefile and tools/perf/Makefile.perf. 'Makefile' automatically computes parallel options for make, but 'Makefile.perf' not, so all test cases is built with one job. It is very slow. This patch adds '-j' options to Makefile.perf testing. It computes parallel building options like what tools/perf/Makefile does, and pass '-j' option to Makefile.perf test. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1452687442-6186-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/make b/tools/perf/tests/make index df38decc48c3..c0ee67906e0a 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -5,7 +5,7 @@ ifeq ($(MAKECMDGOALS),) # no target specified, trigger the whole suite all: @echo "Testing Makefile"; $(MAKE) -sf tests/make MK=Makefile - @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf + @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf SET_PARALLEL=1 else # run only specific test over 'Makefile' %: @@ -14,6 +14,15 @@ endif else PERF := . +PARALLEL_OPT= +ifeq ($(SET_PARALLEL),1) + cores := $(shell (getconf _NPROCESSORS_ONLN || egrep -c '^processor|^CPU[0-9]' /proc/cpuinfo) 2>/dev/null) + ifeq ($(cores),0) + cores := 1 + endif + PARALLEL_OPT="-j$(cores)" +endif + # As per kernel Makefile, avoid funny character set dependencies unexport LC_ALL LC_COLLATE=C @@ -252,7 +261,7 @@ clean := @(cd $(PERF); make -s -f $(MK) clean >/dev/null) $(run): $(call clean) @TMP_DEST=$$(mktemp -d); \ - cmd="cd $(PERF) && make -f $(MK) DESTDIR=$$TMP_DEST $($@)"; \ + cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) DESTDIR=$$TMP_DEST $($@)"; \ echo "- $@: $$cmd" && echo $$cmd > $@ && \ ( eval $$cmd ) >> $@ 2>&1; \ echo " test: $(call test,$@)" >> $@ 2>&1; \ @@ -263,7 +272,7 @@ $(run_O): $(call clean) @TMP_O=$$(mktemp -d); \ TMP_DEST=$$(mktemp -d); \ - cmd="cd $(PERF) && make -f $(MK) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ + cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ echo "- $@: $$cmd" && echo $$cmd > $@ && \ ( eval $$cmd ) >> $@ 2>&1 && \ echo " test: $(call test_O,$@)" >> $@ 2>&1; \ @@ -277,15 +286,15 @@ tarpkg: rm -f $@ make_kernelsrc: - @echo "- make -C tools/perf" + @echo "- make -C $(PARALLEL_OPT) tools/perf" $(call clean); \ - (make -C ../.. tools/perf) > $@ 2>&1 && \ + (make -C ../.. $(PARALLEL_OPT) tools/perf) > $@ 2>&1 && \ test -x perf && rm -f $@ || (cat $@ ; false) make_kernelsrc_tools: - @echo "- make -C /tools perf" + @echo "- make -C /tools $(PARALLEL_OPT) perf" $(call clean); \ - (make -C ../../tools perf) > $@ 2>&1 && \ + (make -C ../../tools $(PARALLEL_OPT) perf) > $@ 2>&1 && \ test -x perf && rm -f $@ || (cat $@ ; false) all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools -- cgit v1.2.3 From eb807730c034090599135bc03578015ebf8974af Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 15 Jan 2016 04:00:14 +0000 Subject: perf build: Pass O option to Makefile.perf in build-test Unlike tools/perf/Makefile, tools/perf/Makefile.perf obey 'O' option when it is passed through cmdline only, due to code in tools/scripts/Makefile.include: ifneq ($(O),) ifeq ($(origin O), command line) ... ABSOLUTE_O := $(shell cd $(O) ; pwd) OUTPUT := $(ABSOLUTE_O)/$(if $(subdir),$(subdir)/) endif endif This patch passes 'O' to Makefile.perf through cmdline explicitly to make it follow O variable during build-test. 'make clean' should have identical 'O' option with 'make'. If not, config-clean may error. Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1452830421-77757-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/make b/tools/perf/tests/make index c0ee67906e0a..fc2a9a31113f 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -5,7 +5,7 @@ ifeq ($(MAKECMDGOALS),) # no target specified, trigger the whole suite all: @echo "Testing Makefile"; $(MAKE) -sf tests/make MK=Makefile - @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf SET_PARALLEL=1 + @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf SET_PARALLEL=1 SET_O=1 else # run only specific test over 'Makefile' %: @@ -13,6 +13,14 @@ else endif else PERF := . +O_OPT := + +ifneq ($(O),) + FULL_O := $(shell readlink -f $(O) || echo $(O)) + ifeq ($(SET_O),1) + O_OPT := 'O=$(FULL_O)' + endif +endif PARALLEL_OPT= ifeq ($(SET_PARALLEL),1) @@ -256,12 +264,12 @@ endif MAKEFLAGS := --no-print-directory -clean := @(cd $(PERF); make -s -f $(MK) clean >/dev/null) +clean := @(cd $(PERF); make -s -f $(MK) $(O_OPT) clean >/dev/null) $(run): $(call clean) @TMP_DEST=$$(mktemp -d); \ - cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) DESTDIR=$$TMP_DEST $($@)"; \ + cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST $($@)"; \ echo "- $@: $$cmd" && echo $$cmd > $@ && \ ( eval $$cmd ) >> $@ 2>&1; \ echo " test: $(call test,$@)" >> $@ 2>&1; \ -- cgit v1.2.3 From 68824de19a0b4cdc17193cb004c55d82c06af8bd Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 15 Jan 2016 04:00:15 +0000 Subject: perf build: Test correct path of perf in build-test If an 'O' is passed to 'make build-test', many 'test -x' and 'test -f' will fail because perf resides in a different directory. Fix this by computing PERF_OUT according to 'O' and test correct output files. For make_kernelsrc and make_kernelsrc_tools, set KBUILD_OUTPUT_DIR instead because the path is different from others ($(O)/perf vs $(O)/tools/perf). Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1452830421-77757-4-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/make b/tools/perf/tests/make index fc2a9a31113f..29810cf2c117 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -13,10 +13,12 @@ else endif else PERF := . +PERF_O := $(PERF) O_OPT := ifneq ($(O),) FULL_O := $(shell readlink -f $(O) || echo $(O)) + PERF_O := $(FULL_O) ifeq ($(SET_O),1) O_OPT := 'O=$(FULL_O)' endif @@ -173,11 +175,11 @@ test_make_doc := $(test_ok) test_make_help_O := $(test_ok) test_make_doc_O := $(test_ok) -test_make_python_perf_so := test -f $(PERF)/python/perf.so +test_make_python_perf_so := test -f $(PERF_O)/python/perf.so -test_make_perf_o := test -f $(PERF)/perf.o -test_make_util_map_o := test -f $(PERF)/util/map.o -test_make_util_pmu_bison_o := test -f $(PERF)/util/pmu-bison.o +test_make_perf_o := test -f $(PERF_O)/perf.o +test_make_util_map_o := test -f $(PERF_O)/util/map.o +test_make_util_pmu_bison_o := test -f $(PERF_O)/util/pmu-bison.o define test_dest_files for file in $(1); do \ @@ -244,7 +246,7 @@ test_make_perf_o_O := test -f $$TMP_O/perf.o test_make_util_map_o_O := test -f $$TMP_O/util/map.o test_make_util_pmu_bison_o_O := test -f $$TMP_O/util/pmu-bison.o -test_default = test -x $(PERF)/perf +test_default = test -x $(PERF_O)/perf test = $(if $(test_$1),$(test_$1),$(test_default)) test_default_O = test -x $$TMP_O/perf @@ -293,17 +295,22 @@ tarpkg: ( eval $$cmd ) >> $@ 2>&1 && \ rm -f $@ +KERNEL_O := ../.. +ifneq ($(O),) + KERNEL_O := $(O) +endif + make_kernelsrc: @echo "- make -C $(PARALLEL_OPT) tools/perf" $(call clean); \ (make -C ../.. $(PARALLEL_OPT) tools/perf) > $@ 2>&1 && \ - test -x perf && rm -f $@ || (cat $@ ; false) + test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) make_kernelsrc_tools: @echo "- make -C /tools $(PARALLEL_OPT) perf" $(call clean); \ (make -C ../../tools $(PARALLEL_OPT) perf) > $@ 2>&1 && \ - test -x perf && rm -f $@ || (cat $@ ; false) + test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools @echo OK -- cgit v1.2.3 From c15e758c4bd7fbe38983e36258541ffcf81e1500 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 15 Jan 2016 04:00:16 +0000 Subject: perf build: Pass O option to kernel makefile in build-test Kernel makefile only follows an 'O' option passed from command line explicitely. In build-test with 'O' option set, kernel makefile contaminate kernel source directory. Build test also fail if we don't create output directory manually. K_O_OPT is added and passed to kernel makefile if 'O' is passed to build-test. Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1452830421-77757-5-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 29810cf2c117..f918015512af 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -22,6 +22,7 @@ ifneq ($(O),) ifeq ($(SET_O),1) O_OPT := 'O=$(FULL_O)' endif + K_O_OPT := 'O=$(FULL_O)' endif PARALLEL_OPT= @@ -301,15 +302,15 @@ ifneq ($(O),) endif make_kernelsrc: - @echo "- make -C $(PARALLEL_OPT) tools/perf" + @echo "- make -C $(PARALLEL_OPT) $(K_O_OPT) tools/perf" $(call clean); \ - (make -C ../.. $(PARALLEL_OPT) tools/perf) > $@ 2>&1 && \ + (make -C ../.. $(PARALLEL_OPT) $(K_O_OPT) tools/perf) > $@ 2>&1 && \ test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) make_kernelsrc_tools: - @echo "- make -C /tools $(PARALLEL_OPT) perf" + @echo "- make -C /tools $(PARALLEL_OPT) $(K_O_OPT) perf" $(call clean); \ - (make -C ../../tools $(PARALLEL_OPT) perf) > $@ 2>&1 && \ + (make -C ../../tools $(PARALLEL_OPT) $(K_O_OPT) perf) > $@ 2>&1 && \ test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools -- cgit v1.2.3 From b8e52be00ca7e7659e8ee172ce9e6f0c6af28ec8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 15 Jan 2016 04:00:17 +0000 Subject: perf build: Add feature-dump target To provide FEATURE-DUMP into $(FEATURE_DUMP_COPY) if defined, with no further action. Get feature dump of the current build: $ make feature-dump BUILD: Doing 'make -j4' parallel build Auto-detecting system features: ... dwarf: [ on ] FEATURE-DUMP file available in FEATURE-DUMP Get feature dump static build into /tmp/fd file: $ make feature-dump FEATURE_DUMP_COPY=/tmp/fd LDFLAGS=-static BUILD: Doing 'make -j4' parallel build Auto-detecting system features: ... dwarf: [ OFF ] SNIP FEATURE-DUMP file copied into /tmp/fd Suggested-by: Wang Nan Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1452830421-77757-6-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 0a22407e1d7d..f758a72df1b3 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -610,6 +610,17 @@ clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean $(python-clean) +# +# To provide FEATURE-DUMP into $(FEATURE_DUMP_COPY) +# file if defined, with no further action. +feature-dump: +ifdef FEATURE_DUMP_COPY + @cp $(OUTPUT)FEATURE-DUMP $(FEATURE_DUMP_COPY) + @echo "FEATURE-DUMP file copied into $(FEATURE_DUMP_COPY)" +else + @echo "FEATURE-DUMP file available in $(OUTPUT)FEATURE-DUMP" +endif + # # Trick: if ../../.git does not exist - we are building out of tree for example, # then force version regeneration: -- cgit v1.2.3 From 96b9e70b8e6cd65f71ee71889143976f3afb038a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 15 Jan 2016 04:00:18 +0000 Subject: perf build: Introduce FEATURES_DUMP make variable Introducing FEATURES_DUMP make variable to provide features detection dump file and bypass the feature detection. The intention is to use this during build tests to skip repeated features detection, like: Get feature dump static build into /tmp/fd file: $ make feature-dump FEATURE_DUMP_COPY=/tmp/fd LDFLAGS=-static BUILD: Doing 'make -j4' parallel build Auto-detecting system features: ... dwarf: [ OFF ] SNIP FEATURE-DUMP file copied into /tmp/fd Use /tmp/fd to build perf: $ make FEATURES_DUMP=/tmp/fd LDFLAGS=-static $ file perf perf: ELF 64-bit LSB executable, x86-64, version 1 (GNU/Linux), statically linked, for ... Suggested-by: Wang Nan Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1452830421-77757-7-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 14 +++++++++++++- tools/perf/config/Makefile | 4 ++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index f758a72df1b3..5d34815c7ccb 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -77,6 +77,9 @@ include config/utilities.mak # Define NO_AUXTRACE if you do not want AUX area tracing support # # Define NO_LIBBPF if you do not want BPF support +# +# Define FEATURES_DUMP to provide features detection dump file +# and bypass the feature detection # As per kernel Makefile, avoid funny character set dependencies unexport LC_ALL @@ -166,6 +169,15 @@ ifeq ($(config),1) include config/Makefile endif +# The FEATURE_DUMP_EXPORT holds location of the actual +# FEATURE_DUMP file to be used to bypass feature detection +# (for bpf or any other subproject) +ifeq ($(FEATURES_DUMP),) +FEATURE_DUMP_EXPORT := $(realpath $(OUTPUT)FEATURE-DUMP) +else +FEATURE_DUMP_EXPORT := $(FEATURES_DUMP) +endif + export prefix bindir sharedir sysconfdir DESTDIR # sparse is architecture-neutral, which means that we need to tell it @@ -436,7 +448,7 @@ $(LIBAPI)-clean: $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null $(LIBBPF): fixdep FORCE - $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(realpath $(OUTPUT)FEATURE-DUMP) + $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(FEATURE_DUMP_EXPORT) $(LIBBPF)-clean: $(call QUIET_CLEAN, libbpf) diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index e5959c136a19..511141b102e8 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -181,7 +181,11 @@ LDFLAGS += -Wl,-z,noexecstack EXTLIBS = -lpthread -lrt -lm -ldl +ifeq ($(FEATURES_DUMP),) include $(srctree)/tools/build/Makefile.feature +else +include $(FEATURES_DUMP) +endif ifeq ($(feature-stackprotector-all), 1) CFLAGS += -fstack-protector-all -- cgit v1.2.3 From a7c490333df3cff5086ddf19a0837529304fa097 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 20 Jan 2016 21:12:58 +0200 Subject: tools/virtio: use virt_xxx barriers Fix build after API changes. Reported-by: Kamal Mostafa Signed-off-by: Michael S. Tsirkin --- tools/virtio/asm/barrier.h | 22 +++++++++++++--------- tools/virtio/linux/compiler.h | 9 +++++++++ tools/virtio/linux/kernel.h | 1 + 3 files changed, 23 insertions(+), 9 deletions(-) create mode 100644 tools/virtio/linux/compiler.h (limited to 'tools') diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h index 26b7926bda88..ba34f9e96efd 100644 --- a/tools/virtio/asm/barrier.h +++ b/tools/virtio/asm/barrier.h @@ -1,15 +1,19 @@ #if defined(__i386__) || defined(__x86_64__) #define barrier() asm volatile("" ::: "memory") -#define mb() __sync_synchronize() - -#define smp_mb() mb() -# define dma_rmb() barrier() -# define dma_wmb() barrier() -# define smp_rmb() barrier() -# define smp_wmb() barrier() +#define virt_mb() __sync_synchronize() +#define virt_rmb() barrier() +#define virt_wmb() barrier() +/* Atomic store should be enough, but gcc generates worse code in that case. */ +#define virt_store_mb(var, value) do { \ + typeof(var) virt_store_mb_value = (value); \ + __atomic_exchange(&(var), &virt_store_mb_value, &virt_store_mb_value, \ + __ATOMIC_SEQ_CST); \ + barrier(); \ +} while (0); /* Weak barriers should be used. If not - it's a bug */ -# define rmb() abort() -# define wmb() abort() +# define mb() abort() +# define rmb() abort() +# define wmb() abort() #else #error Please fill in barrier macros #endif diff --git a/tools/virtio/linux/compiler.h b/tools/virtio/linux/compiler.h new file mode 100644 index 000000000000..845960e1cbf2 --- /dev/null +++ b/tools/virtio/linux/compiler.h @@ -0,0 +1,9 @@ +#ifndef LINUX_COMPILER_H +#define LINUX_COMPILER_H + +#define WRITE_ONCE(var, val) \ + (*((volatile typeof(val) *)(&(var))) = (val)) + +#define READ_ONCE(var) (*((volatile typeof(val) *)(&(var)))) + +#endif diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h index 4db7d5691ba7..033849948215 100644 --- a/tools/virtio/linux/kernel.h +++ b/tools/virtio/linux/kernel.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 481eaec37e91e2b33f17275901172f50ce2c71e8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 21 Jan 2016 14:44:10 +0200 Subject: tools/virtio: add ringtest utilities This adds micro-benchmarks useful for tuning virtio ring layouts. Three layouts are currently implemented: - virtio 0.9 compatible one - an experimental extension bypassing the ring index, polling ring itself instead - an experimental extension bypassing avail and used ring completely Typical use: sh run-on-all.sh perf stat -r 10 --log-fd 1 -- ./ring It doesn't depend on the kernel directly, but it's handy to have as much virtio stuff as possible in one tree. Signed-off-by: Michael S. Tsirkin --- tools/virtio/ringtest/Makefile | 22 ++ tools/virtio/ringtest/README | 2 + tools/virtio/ringtest/main.c | 366 +++++++++++++++++++++++++++++++ tools/virtio/ringtest/main.h | 119 ++++++++++ tools/virtio/ringtest/ring.c | 272 +++++++++++++++++++++++ tools/virtio/ringtest/run-on-all.sh | 24 ++ tools/virtio/ringtest/virtio_ring_0_9.c | 316 ++++++++++++++++++++++++++ tools/virtio/ringtest/virtio_ring_poll.c | 2 + 8 files changed, 1123 insertions(+) create mode 100644 tools/virtio/ringtest/Makefile create mode 100644 tools/virtio/ringtest/README create mode 100644 tools/virtio/ringtest/main.c create mode 100644 tools/virtio/ringtest/main.h create mode 100644 tools/virtio/ringtest/ring.c create mode 100755 tools/virtio/ringtest/run-on-all.sh create mode 100644 tools/virtio/ringtest/virtio_ring_0_9.c create mode 100644 tools/virtio/ringtest/virtio_ring_poll.c (limited to 'tools') diff --git a/tools/virtio/ringtest/Makefile b/tools/virtio/ringtest/Makefile new file mode 100644 index 000000000000..feaa64ac4630 --- /dev/null +++ b/tools/virtio/ringtest/Makefile @@ -0,0 +1,22 @@ +all: + +all: ring virtio_ring_0_9 virtio_ring_poll + +CFLAGS += -Wall +CFLAGS += -pthread -O2 -ggdb +LDFLAGS += -pthread -O2 -ggdb + +main.o: main.c main.h +ring.o: ring.c main.h +virtio_ring_0_9.o: virtio_ring_0_9.c main.h +virtio_ring_poll.o: virtio_ring_poll.c virtio_ring_0_9.c main.h +ring: ring.o main.o +virtio_ring_0_9: virtio_ring_0_9.o main.o +virtio_ring_poll: virtio_ring_poll.o main.o +clean: + -rm main.o + -rm ring.o ring + -rm virtio_ring_0_9.o virtio_ring_0_9 + -rm virtio_ring_poll.o virtio_ring_poll + +.PHONY: all clean diff --git a/tools/virtio/ringtest/README b/tools/virtio/ringtest/README new file mode 100644 index 000000000000..34e94c46104f --- /dev/null +++ b/tools/virtio/ringtest/README @@ -0,0 +1,2 @@ +Partial implementation of various ring layouts, useful to tune virtio design. +Uses shared memory heavily. diff --git a/tools/virtio/ringtest/main.c b/tools/virtio/ringtest/main.c new file mode 100644 index 000000000000..3a5ff438bd62 --- /dev/null +++ b/tools/virtio/ringtest/main.c @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2016 Red Hat, Inc. + * Author: Michael S. Tsirkin + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Command line processing and common functions for ring benchmarking. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include "main.h" +#include +#include +#include +#include +#include + +int runcycles = 10000000; +int max_outstanding = INT_MAX; +int batch = 1; + +bool do_sleep = false; +bool do_relax = false; +bool do_exit = true; + +unsigned ring_size = 256; + +static int kickfd = -1; +static int callfd = -1; + +void notify(int fd) +{ + unsigned long long v = 1; + int r; + + vmexit(); + r = write(fd, &v, sizeof v); + assert(r == sizeof v); + vmentry(); +} + +void wait_for_notify(int fd) +{ + unsigned long long v = 1; + int r; + + vmexit(); + r = read(fd, &v, sizeof v); + assert(r == sizeof v); + vmentry(); +} + +void kick(void) +{ + notify(kickfd); +} + +void wait_for_kick(void) +{ + wait_for_notify(kickfd); +} + +void call(void) +{ + notify(callfd); +} + +void wait_for_call(void) +{ + wait_for_notify(callfd); +} + +void set_affinity(const char *arg) +{ + cpu_set_t cpuset; + int ret; + pthread_t self; + long int cpu; + char *endptr; + + if (!arg) + return; + + cpu = strtol(arg, &endptr, 0); + assert(!*endptr); + + assert(cpu >= 0 || cpu < CPU_SETSIZE); + + self = pthread_self(); + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + + ret = pthread_setaffinity_np(self, sizeof(cpu_set_t), &cpuset); + assert(!ret); +} + +static void run_guest(void) +{ + int completed_before; + int completed = 0; + int started = 0; + int bufs = runcycles; + int spurious = 0; + int r; + unsigned len; + void *buf; + int tokick = batch; + + for (;;) { + if (do_sleep) + disable_call(); + completed_before = completed; + do { + if (started < bufs && + started - completed < max_outstanding) { + r = add_inbuf(0, NULL, "Hello, world!"); + if (__builtin_expect(r == 0, true)) { + ++started; + if (!--tokick) { + tokick = batch; + if (do_sleep) + kick_available(); + } + + } + } else + r = -1; + + /* Flush out completed bufs if any */ + if (get_buf(&len, &buf)) { + ++completed; + if (__builtin_expect(completed == bufs, false)) + return; + r = 0; + } + } while (r == 0); + if (completed == completed_before) + ++spurious; + assert(completed <= bufs); + assert(started <= bufs); + if (do_sleep) { + if (enable_call()) + wait_for_call(); + } else { + poll_used(); + } + } +} + +static void run_host(void) +{ + int completed_before; + int completed = 0; + int spurious = 0; + int bufs = runcycles; + unsigned len; + void *buf; + + for (;;) { + if (do_sleep) { + if (enable_kick()) + wait_for_kick(); + } else { + poll_avail(); + } + if (do_sleep) + disable_kick(); + completed_before = completed; + while (__builtin_expect(use_buf(&len, &buf), true)) { + if (do_sleep) + call_used(); + ++completed; + if (__builtin_expect(completed == bufs, false)) + return; + } + if (completed == completed_before) + ++spurious; + assert(completed <= bufs); + if (completed == bufs) + break; + } +} + +void *start_guest(void *arg) +{ + set_affinity(arg); + run_guest(); + pthread_exit(NULL); +} + +void *start_host(void *arg) +{ + set_affinity(arg); + run_host(); + pthread_exit(NULL); +} + +static const char optstring[] = ""; +static const struct option longopts[] = { + { + .name = "help", + .has_arg = no_argument, + .val = 'h', + }, + { + .name = "host-affinity", + .has_arg = required_argument, + .val = 'H', + }, + { + .name = "guest-affinity", + .has_arg = required_argument, + .val = 'G', + }, + { + .name = "ring-size", + .has_arg = required_argument, + .val = 'R', + }, + { + .name = "run-cycles", + .has_arg = required_argument, + .val = 'C', + }, + { + .name = "outstanding", + .has_arg = required_argument, + .val = 'o', + }, + { + .name = "batch", + .has_arg = required_argument, + .val = 'b', + }, + { + .name = "sleep", + .has_arg = no_argument, + .val = 's', + }, + { + .name = "relax", + .has_arg = no_argument, + .val = 'x', + }, + { + .name = "exit", + .has_arg = no_argument, + .val = 'e', + }, + { + } +}; + +static void help(void) +{ + fprintf(stderr, "Usage: [--help]" + " [--host-affinity H]" + " [--guest-affinity G]" + " [--ring-size R (default: %d)]" + " [--run-cycles C (default: %d)]" + " [--batch b]" + " [--outstanding o]" + " [--sleep]" + " [--relax]" + " [--exit]" + "\n", + ring_size, + runcycles); +} + +int main(int argc, char **argv) +{ + int ret; + pthread_t host, guest; + void *tret; + char *host_arg = NULL; + char *guest_arg = NULL; + char *endptr; + long int c; + + kickfd = eventfd(0, 0); + assert(kickfd >= 0); + callfd = eventfd(0, 0); + assert(callfd >= 0); + + for (;;) { + int o = getopt_long(argc, argv, optstring, longopts, NULL); + switch (o) { + case -1: + goto done; + case '?': + help(); + exit(2); + case 'H': + host_arg = optarg; + break; + case 'G': + guest_arg = optarg; + break; + case 'R': + ring_size = strtol(optarg, &endptr, 0); + assert(ring_size && !(ring_size & (ring_size - 1))); + assert(!*endptr); + break; + case 'C': + c = strtol(optarg, &endptr, 0); + assert(!*endptr); + assert(c > 0 && c < INT_MAX); + runcycles = c; + break; + case 'o': + c = strtol(optarg, &endptr, 0); + assert(!*endptr); + assert(c > 0 && c < INT_MAX); + max_outstanding = c; + break; + case 'b': + c = strtol(optarg, &endptr, 0); + assert(!*endptr); + assert(c > 0 && c < INT_MAX); + batch = c; + break; + case 's': + do_sleep = true; + break; + case 'x': + do_relax = true; + break; + case 'e': + do_exit = true; + break; + default: + help(); + exit(4); + break; + } + } + + /* does nothing here, used to make sure all smp APIs compile */ + smp_acquire(); + smp_release(); + smp_mb(); +done: + + if (batch > max_outstanding) + batch = max_outstanding; + + if (optind < argc) { + help(); + exit(4); + } + alloc_ring(); + + ret = pthread_create(&host, NULL, start_host, host_arg); + assert(!ret); + ret = pthread_create(&guest, NULL, start_guest, guest_arg); + assert(!ret); + + ret = pthread_join(guest, &tret); + assert(!ret); + ret = pthread_join(host, &tret); + assert(!ret); + return 0; +} diff --git a/tools/virtio/ringtest/main.h b/tools/virtio/ringtest/main.h new file mode 100644 index 000000000000..16917acb0ade --- /dev/null +++ b/tools/virtio/ringtest/main.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2016 Red Hat, Inc. + * Author: Michael S. Tsirkin + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Common macros and functions for ring benchmarking. + */ +#ifndef MAIN_H +#define MAIN_H + +#include + +extern bool do_exit; + +#if defined(__x86_64__) || defined(__i386__) +#include "x86intrin.h" + +static inline void wait_cycles(unsigned long long cycles) +{ + unsigned long long t; + + t = __rdtsc(); + while (__rdtsc() - t < cycles) {} +} + +#define VMEXIT_CYCLES 500 +#define VMENTRY_CYCLES 500 + +#else +static inline void wait_cycles(unsigned long long cycles) +{ + _Exit(5); +} +#define VMEXIT_CYCLES 0 +#define VMENTRY_CYCLES 0 +#endif + +static inline void vmexit(void) +{ + if (!do_exit) + return; + + wait_cycles(VMEXIT_CYCLES); +} +static inline void vmentry(void) +{ + if (!do_exit) + return; + + wait_cycles(VMENTRY_CYCLES); +} + +/* implemented by ring */ +void alloc_ring(void); +/* guest side */ +int add_inbuf(unsigned, void *, void *); +void *get_buf(unsigned *, void **); +void disable_call(); +bool enable_call(); +void kick_available(); +void poll_used(); +/* host side */ +void disable_kick(); +bool enable_kick(); +bool use_buf(unsigned *, void **); +void call_used(); +void poll_avail(); + +/* implemented by main */ +extern bool do_sleep; +void kick(void); +void wait_for_kick(void); +void call(void); +void wait_for_call(void); + +extern unsigned ring_size; + +/* Compiler barrier - similar to what Linux uses */ +#define barrier() asm volatile("" ::: "memory") + +/* Is there a portable way to do this? */ +#if defined(__x86_64__) || defined(__i386__) +#define cpu_relax() asm ("rep; nop" ::: "memory") +#else +#define cpu_relax() assert(0) +#endif + +extern bool do_relax; + +static inline void busy_wait(void) +{ + if (do_relax) + cpu_relax(); + else + /* prevent compiler from removing busy loops */ + barrier(); +} + +/* + * Not using __ATOMIC_SEQ_CST since gcc docs say they are only synchronized + * with other __ATOMIC_SEQ_CST calls. + */ +#define smp_mb() __sync_synchronize() + +/* + * This abuses the atomic builtins for thread fences, and + * adds a compiler barrier. + */ +#define smp_release() do { \ + barrier(); \ + __atomic_thread_fence(__ATOMIC_RELEASE); \ +} while (0) + +#define smp_acquire() do { \ + __atomic_thread_fence(__ATOMIC_ACQUIRE); \ + barrier(); \ +} while (0) + +#endif diff --git a/tools/virtio/ringtest/ring.c b/tools/virtio/ringtest/ring.c new file mode 100644 index 000000000000..c25c8d248b6b --- /dev/null +++ b/tools/virtio/ringtest/ring.c @@ -0,0 +1,272 @@ +/* + * Copyright (C) 2016 Red Hat, Inc. + * Author: Michael S. Tsirkin + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Simple descriptor-based ring. virtio 0.9 compatible event index is used for + * signalling, unconditionally. + */ +#define _GNU_SOURCE +#include "main.h" +#include +#include +#include + +/* Next - Where next entry will be written. + * Prev - "Next" value when event triggered previously. + * Event - Peer requested event after writing this entry. + */ +static inline bool need_event(unsigned short event, + unsigned short next, + unsigned short prev) +{ + return (unsigned short)(next - event - 1) < (unsigned short)(next - prev); +} + +/* Design: + * Guest adds descriptors with unique index values and DESC_HW in flags. + * Host overwrites used descriptors with correct len, index, and DESC_HW clear. + * Flags are always set last. + */ +#define DESC_HW 0x1 + +struct desc { + unsigned short flags; + unsigned short index; + unsigned len; + unsigned long long addr; +}; + +/* how much padding is needed to avoid false cache sharing */ +#define HOST_GUEST_PADDING 0x80 + +/* Mostly read */ +struct event { + unsigned short kick_index; + unsigned char reserved0[HOST_GUEST_PADDING - 2]; + unsigned short call_index; + unsigned char reserved1[HOST_GUEST_PADDING - 2]; +}; + +struct data { + void *buf; /* descriptor is writeable, we can't get buf from there */ + void *data; +} *data; + +struct desc *ring; +struct event *event; + +struct guest { + unsigned avail_idx; + unsigned last_used_idx; + unsigned num_free; + unsigned kicked_avail_idx; + unsigned char reserved[HOST_GUEST_PADDING - 12]; +} guest; + +struct host { + /* we do not need to track last avail index + * unless we have more than one in flight. + */ + unsigned used_idx; + unsigned called_used_idx; + unsigned char reserved[HOST_GUEST_PADDING - 4]; +} host; + +/* implemented by ring */ +void alloc_ring(void) +{ + int ret; + int i; + + ret = posix_memalign((void **)&ring, 0x1000, ring_size * sizeof *ring); + if (ret) { + perror("Unable to allocate ring buffer.\n"); + exit(3); + } + event = malloc(sizeof *event); + if (!event) { + perror("Unable to allocate event buffer.\n"); + exit(3); + } + memset(event, 0, sizeof *event); + guest.avail_idx = 0; + guest.kicked_avail_idx = -1; + guest.last_used_idx = 0; + host.used_idx = 0; + host.called_used_idx = -1; + for (i = 0; i < ring_size; ++i) { + struct desc desc = { + .index = i, + }; + ring[i] = desc; + } + guest.num_free = ring_size; + data = malloc(ring_size * sizeof *data); + if (!data) { + perror("Unable to allocate data buffer.\n"); + exit(3); + } + memset(data, 0, ring_size * sizeof *data); +} + +/* guest side */ +int add_inbuf(unsigned len, void *buf, void *datap) +{ + unsigned head, index; + + if (!guest.num_free) + return -1; + + guest.num_free--; + head = (ring_size - 1) & (guest.avail_idx++); + + /* Start with a write. On MESI architectures this helps + * avoid a shared state with consumer that is polling this descriptor. + */ + ring[head].addr = (unsigned long)(void*)buf; + ring[head].len = len; + /* read below might bypass write above. That is OK because it's just an + * optimization. If this happens, we will get the cache line in a + * shared state which is unfortunate, but probably not worth it to + * add an explicit full barrier to avoid this. + */ + barrier(); + index = ring[head].index; + data[index].buf = buf; + data[index].data = datap; + /* Barrier A (for pairing) */ + smp_release(); + ring[head].flags = DESC_HW; + + return 0; +} + +void *get_buf(unsigned *lenp, void **bufp) +{ + unsigned head = (ring_size - 1) & guest.last_used_idx; + unsigned index; + void *datap; + + if (ring[head].flags & DESC_HW) + return NULL; + /* Barrier B (for pairing) */ + smp_acquire(); + *lenp = ring[head].len; + index = ring[head].index & (ring_size - 1); + datap = data[index].data; + *bufp = data[index].buf; + data[index].buf = NULL; + data[index].data = NULL; + guest.num_free++; + guest.last_used_idx++; + return datap; +} + +void poll_used(void) +{ + unsigned head = (ring_size - 1) & guest.last_used_idx; + + while (ring[head].flags & DESC_HW) + busy_wait(); +} + +void disable_call() +{ + /* Doing nothing to disable calls might cause + * extra interrupts, but reduces the number of cache misses. + */ +} + +bool enable_call() +{ + unsigned head = (ring_size - 1) & guest.last_used_idx; + + event->call_index = guest.last_used_idx; + /* Flush call index write */ + /* Barrier D (for pairing) */ + smp_mb(); + return ring[head].flags & DESC_HW; +} + +void kick_available(void) +{ + /* Flush in previous flags write */ + /* Barrier C (for pairing) */ + smp_mb(); + if (!need_event(event->kick_index, + guest.avail_idx, + guest.kicked_avail_idx)) + return; + + guest.kicked_avail_idx = guest.avail_idx; + kick(); +} + +/* host side */ +void disable_kick() +{ + /* Doing nothing to disable kicks might cause + * extra interrupts, but reduces the number of cache misses. + */ +} + +bool enable_kick() +{ + unsigned head = (ring_size - 1) & host.used_idx; + + event->kick_index = host.used_idx; + /* Barrier C (for pairing) */ + smp_mb(); + return !(ring[head].flags & DESC_HW); +} + +void poll_avail(void) +{ + unsigned head = (ring_size - 1) & host.used_idx; + + while (!(ring[head].flags & DESC_HW)) + busy_wait(); +} + +bool use_buf(unsigned *lenp, void **bufp) +{ + unsigned head = (ring_size - 1) & host.used_idx; + + if (!(ring[head].flags & DESC_HW)) + return false; + + /* make sure length read below is not speculated */ + /* Barrier A (for pairing) */ + smp_acquire(); + + /* simple in-order completion: we don't need + * to touch index at all. This also means we + * can just modify the descriptor in-place. + */ + ring[head].len--; + /* Make sure len is valid before flags. + * Note: alternative is to write len and flags in one access - + * possible on 64 bit architectures but wmb is free on Intel anyway + * so I have no way to test whether it's a gain. + */ + /* Barrier B (for pairing) */ + smp_release(); + ring[head].flags = 0; + host.used_idx++; + return true; +} + +void call_used(void) +{ + /* Flush in previous flags write */ + /* Barrier D (for pairing) */ + smp_mb(); + if (!need_event(event->call_index, + host.used_idx, + host.called_used_idx)) + return; + + host.called_used_idx = host.used_idx; + call(); +} diff --git a/tools/virtio/ringtest/run-on-all.sh b/tools/virtio/ringtest/run-on-all.sh new file mode 100755 index 000000000000..52b0f71ffa8d --- /dev/null +++ b/tools/virtio/ringtest/run-on-all.sh @@ -0,0 +1,24 @@ +#!/bin/sh + +#use last CPU for host. Why not the first? +#many devices tend to use cpu0 by default so +#it tends to be busier +HOST_AFFINITY=$(cd /dev/cpu; ls|grep -v '[a-z]'|sort -n|tail -1) + +#run command on all cpus +for cpu in $(cd /dev/cpu; ls|grep -v '[a-z]'|sort -n); +do + #Don't run guest and host on same CPU + #It actually works ok if using signalling + if + (echo "$@" | grep -e "--sleep" > /dev/null) || \ + test $HOST_AFFINITY '!=' $cpu + then + echo "GUEST AFFINITY $cpu" + "$@" --host-affinity $HOST_AFFINITY --guest-affinity $cpu + fi +done +echo "NO GUEST AFFINITY" +"$@" --host-affinity $HOST_AFFINITY +echo "NO AFFINITY" +"$@" diff --git a/tools/virtio/ringtest/virtio_ring_0_9.c b/tools/virtio/ringtest/virtio_ring_0_9.c new file mode 100644 index 000000000000..47c9a1a18d36 --- /dev/null +++ b/tools/virtio/ringtest/virtio_ring_0_9.c @@ -0,0 +1,316 @@ +/* + * Copyright (C) 2016 Red Hat, Inc. + * Author: Michael S. Tsirkin + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Partial implementation of virtio 0.9. event index is used for signalling, + * unconditionally. Design roughly follows linux kernel implementation in order + * to be able to judge its performance. + */ +#define _GNU_SOURCE +#include "main.h" +#include +#include +#include +#include +#include + +struct data { + void *data; +} *data; + +struct vring ring; + +/* enabling the below activates experimental ring polling code + * (which skips index reads on consumer in favor of looking at + * high bits of ring id ^ 0x8000). + */ +/* #ifdef RING_POLL */ + +/* how much padding is needed to avoid false cache sharing */ +#define HOST_GUEST_PADDING 0x80 + +struct guest { + unsigned short avail_idx; + unsigned short last_used_idx; + unsigned short num_free; + unsigned short kicked_avail_idx; + unsigned short free_head; + unsigned char reserved[HOST_GUEST_PADDING - 10]; +} guest; + +struct host { + /* we do not need to track last avail index + * unless we have more than one in flight. + */ + unsigned short used_idx; + unsigned short called_used_idx; + unsigned char reserved[HOST_GUEST_PADDING - 4]; +} host; + +/* implemented by ring */ +void alloc_ring(void) +{ + int ret; + int i; + void *p; + + ret = posix_memalign(&p, 0x1000, vring_size(ring_size, 0x1000)); + if (ret) { + perror("Unable to allocate ring buffer.\n"); + exit(3); + } + memset(p, 0, vring_size(ring_size, 0x1000)); + vring_init(&ring, ring_size, p, 0x1000); + + guest.avail_idx = 0; + guest.kicked_avail_idx = -1; + guest.last_used_idx = 0; + /* Put everything in free lists. */ + guest.free_head = 0; + for (i = 0; i < ring_size - 1; i++) + ring.desc[i].next = i + 1; + host.used_idx = 0; + host.called_used_idx = -1; + guest.num_free = ring_size; + data = malloc(ring_size * sizeof *data); + if (!data) { + perror("Unable to allocate data buffer.\n"); + exit(3); + } + memset(data, 0, ring_size * sizeof *data); +} + +/* guest side */ +int add_inbuf(unsigned len, void *buf, void *datap) +{ + unsigned head, avail; + struct vring_desc *desc; + + if (!guest.num_free) + return -1; + + head = guest.free_head; + guest.num_free--; + + desc = ring.desc; + desc[head].flags = VRING_DESC_F_NEXT; + desc[head].addr = (unsigned long)(void *)buf; + desc[head].len = len; + /* We do it like this to simulate the way + * we'd have to flip it if we had multiple + * descriptors. + */ + desc[head].flags &= ~VRING_DESC_F_NEXT; + guest.free_head = desc[head].next; + + data[head].data = datap; + +#ifdef RING_POLL + /* Barrier A (for pairing) */ + smp_release(); + avail = guest.avail_idx++; + ring.avail->ring[avail & (ring_size - 1)] = + (head | (avail & ~(ring_size - 1))) ^ 0x8000; +#else + avail = (ring_size - 1) & (guest.avail_idx++); + ring.avail->ring[avail] = head; + /* Barrier A (for pairing) */ + smp_release(); +#endif + ring.avail->idx = guest.avail_idx; + return 0; +} + +void *get_buf(unsigned *lenp, void **bufp) +{ + unsigned head; + unsigned index; + void *datap; + +#ifdef RING_POLL + head = (ring_size - 1) & guest.last_used_idx; + index = ring.used->ring[head].id; + if ((index ^ guest.last_used_idx ^ 0x8000) & ~(ring_size - 1)) + return NULL; + /* Barrier B (for pairing) */ + smp_acquire(); + index &= ring_size - 1; +#else + if (ring.used->idx == guest.last_used_idx) + return NULL; + /* Barrier B (for pairing) */ + smp_acquire(); + head = (ring_size - 1) & guest.last_used_idx; + index = ring.used->ring[head].id; +#endif + *lenp = ring.used->ring[head].len; + datap = data[index].data; + *bufp = (void*)(unsigned long)ring.desc[index].addr; + data[index].data = NULL; + ring.desc[index].next = guest.free_head; + guest.free_head = index; + guest.num_free++; + guest.last_used_idx++; + return datap; +} + +void poll_used(void) +{ +#ifdef RING_POLL + unsigned head = (ring_size - 1) & guest.last_used_idx; + + for (;;) { + unsigned index = ring.used->ring[head].id; + + if ((index ^ guest.last_used_idx ^ 0x8000) & ~(ring_size - 1)) + busy_wait(); + else + break; + } +#else + unsigned head = guest.last_used_idx; + + while (ring.used->idx == head) + busy_wait(); +#endif +} + +void disable_call() +{ + /* Doing nothing to disable calls might cause + * extra interrupts, but reduces the number of cache misses. + */ +} + +bool enable_call() +{ + unsigned short last_used_idx; + + vring_used_event(&ring) = (last_used_idx = guest.last_used_idx); + /* Flush call index write */ + /* Barrier D (for pairing) */ + smp_mb(); +#ifdef RING_POLL + { + unsigned short head = last_used_idx & (ring_size - 1); + unsigned index = ring.used->ring[head].id; + + return (index ^ last_used_idx ^ 0x8000) & ~(ring_size - 1); + } +#else + return ring.used->idx == last_used_idx; +#endif +} + +void kick_available(void) +{ + /* Flush in previous flags write */ + /* Barrier C (for pairing) */ + smp_mb(); + if (!vring_need_event(vring_avail_event(&ring), + guest.avail_idx, + guest.kicked_avail_idx)) + return; + + guest.kicked_avail_idx = guest.avail_idx; + kick(); +} + +/* host side */ +void disable_kick() +{ + /* Doing nothing to disable kicks might cause + * extra interrupts, but reduces the number of cache misses. + */ +} + +bool enable_kick() +{ + unsigned head = host.used_idx; + + vring_avail_event(&ring) = head; + /* Barrier C (for pairing) */ + smp_mb(); +#ifdef RING_POLL + { + unsigned index = ring.avail->ring[head & (ring_size - 1)]; + + return (index ^ head ^ 0x8000) & ~(ring_size - 1); + } +#else + return head == ring.avail->idx; +#endif +} + +void poll_avail(void) +{ + unsigned head = host.used_idx; +#ifdef RING_POLL + for (;;) { + unsigned index = ring.avail->ring[head & (ring_size - 1)]; + if ((index ^ head ^ 0x8000) & ~(ring_size - 1)) + busy_wait(); + else + break; + } +#else + while (ring.avail->idx == head) + busy_wait(); +#endif +} + +bool use_buf(unsigned *lenp, void **bufp) +{ + unsigned used_idx = host.used_idx; + struct vring_desc *desc; + unsigned head; + +#ifdef RING_POLL + head = ring.avail->ring[used_idx & (ring_size - 1)]; + if ((used_idx ^ head ^ 0x8000) & ~(ring_size - 1)) + return false; + /* Barrier A (for pairing) */ + smp_acquire(); + + used_idx &= ring_size - 1; + desc = &ring.desc[head & (ring_size - 1)]; +#else + if (used_idx == ring.avail->idx) + return false; + + /* Barrier A (for pairing) */ + smp_acquire(); + + used_idx &= ring_size - 1; + head = ring.avail->ring[used_idx]; + desc = &ring.desc[head]; +#endif + + *lenp = desc->len; + *bufp = (void *)(unsigned long)desc->addr; + + /* now update used ring */ + ring.used->ring[used_idx].id = head; + ring.used->ring[used_idx].len = desc->len - 1; + /* Barrier B (for pairing) */ + smp_release(); + host.used_idx++; + ring.used->idx = host.used_idx; + + return true; +} + +void call_used(void) +{ + /* Flush in previous flags write */ + /* Barrier D (for pairing) */ + smp_mb(); + if (!vring_need_event(vring_used_event(&ring), + host.used_idx, + host.called_used_idx)) + return; + + host.called_used_idx = host.used_idx; + call(); +} diff --git a/tools/virtio/ringtest/virtio_ring_poll.c b/tools/virtio/ringtest/virtio_ring_poll.c new file mode 100644 index 000000000000..84fc2c557aaa --- /dev/null +++ b/tools/virtio/ringtest/virtio_ring_poll.c @@ -0,0 +1,2 @@ +#define RING_POLL 1 +#include "virtio_ring_0_9.c" -- cgit v1.2.3 From cf89813a5b514bff9b3b5e7eaf2090f22fba62e0 Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Mon, 14 Dec 2015 16:43:35 +0100 Subject: perf tests: Remove wrong semicolon in while loop in CQM test The while loop was spinning. Fix by removing a semicolon. The issue was pointed out by gcc-6's -Wmisleading-indentation. Signed-off-by: Markus Trippelsdorf Reviewed-by: Matt Fleming Acked-by: Ingo Molnar Cc: Ben Hutchings Cc: Peter Zijlstra Fixes: 035827e9f2bd ("perf tests: Add Intel CQM test") Link: http://lkml.kernel.org/r/20151214154335.GA1409@x4 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/intel-cqm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/tests/intel-cqm.c b/tools/perf/arch/x86/tests/intel-cqm.c index 3e89ba825f6b..7f064eb37158 100644 --- a/tools/perf/arch/x86/tests/intel-cqm.c +++ b/tools/perf/arch/x86/tests/intel-cqm.c @@ -17,7 +17,7 @@ static pid_t spawn(void) if (pid) return pid; - while(1); + while(1) sleep(5); return 0; } -- cgit v1.2.3 From d4913cbd05bab685e49c8174896e563b2487d054 Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Mon, 14 Dec 2015 16:44:03 +0100 Subject: perf annotate browser: Fix behaviour of Shift-Tab with nothing focussed The issue was pointed out by gcc-6's -Wmisleading-indentation. Signed-off-by: Markus Trippelsdorf Acked-by: Ingo Molnar Cc: Ben Hutchings Cc: Matt Fleming Cc: Peter Zijlstra Fixes: c97cf42219b7 ("perf top: Live TUI Annotation") Link: http://lkml.kernel.org/r/20151214154403.GB1409@x4 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index d4d7cc27252f..718bd46d47fa 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -755,11 +755,11 @@ static int annotate_browser__run(struct annotate_browser *browser, nd = browser->curr_hot; break; case K_UNTAB: - if (nd != NULL) + if (nd != NULL) { nd = rb_next(nd); if (nd == NULL) nd = rb_first(&browser->entries); - else + } else nd = browser->curr_hot; break; case K_F1: -- cgit v1.2.3 From 0805909f59e02036a4e2660159f27dbf8b6084ac Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 20 Jan 2016 12:56:33 +0100 Subject: perf hists: Fix HISTC_MEM_DCACHELINE width setting Set correct width for unresolved mem_dcacheline addr. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Don Zickus Cc: Namhyung Kim Cc: Peter Zijlstra Fixes: 9b32ba71ba90 ("perf tools: Add dcacheline sort") Link: http://lkml.kernel.org/r/1453290995-18485-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index c226303e3da0..68a7612019dc 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -131,6 +131,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) symlen = unresolved_col_width + 4 + 2; hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, symlen); + hists__new_col_len(hists, HISTC_MEM_DCACHELINE, + symlen); } if (h->mem_info->iaddr.sym) { -- cgit v1.2.3 From 3f416f22d1e21709a631189ba169f76fd267b374 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 20 Jan 2016 12:56:34 +0100 Subject: perf stat: Do not clean event's private stats Mel reported stddev reporting was broken due to following commit: 106a94a0f8c2 ("perf stat: Introduce read_counters function") This commit merged interval and overall counters reading into single read_counters function. The old interval code cleaned the stddev data for some reason (it's never displayed in interval mode) and the mentioned commit kept on cleaning the stddev data in merged function, which resulted in the stddev not being displayed. Removing the wrong stddev data cleanup init_stats call. Reported-and-Tested-by: Mel Gorman Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: stable@vger.kernel.org # v4.2+ Fixes: 106a94a0f8c2 ("perf stat: Introduce read_counters function") Link: http://lkml.kernel.org/r/1453290995-18485-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 2f901d15e063..2b58edccd56f 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -310,7 +310,6 @@ int perf_stat_process_counter(struct perf_stat_config *config, int i, ret; aggr->val = aggr->ena = aggr->run = 0; - init_stats(ps->res_stats); if (counter->per_pkg) zero_per_pkg(counter); -- cgit v1.2.3 From e03a58c320e1103ebe97bda8ebdfcc5c9829c53f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 21 Jan 2016 15:03:35 -0800 Subject: kselftests: timers: Add adjtimex SETOFFSET validity tests Add some simple tests to check both valid and invalid offsets when using adjtimex's ADJ_SETOFFSET method. Signed-off-by: John Stultz Acked-by: Shuah Khan Cc: Sasha Levin Cc: Richard Cochran Cc: Prarit Bhargava Cc: Harald Hoyer Cc: Kay Sievers Cc: David Herrmann Link: http://lkml.kernel.org/r/1453417415-19110-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- tools/testing/selftests/timers/valid-adjtimex.c | 139 +++++++++++++++++++++++- 1 file changed, 138 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index e86d937cc22c..60fe3c569bd9 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -45,7 +45,17 @@ static inline int ksft_exit_fail(void) } #endif -#define NSEC_PER_SEC 1000000000L +#define NSEC_PER_SEC 1000000000LL +#define USEC_PER_SEC 1000000LL + +#define ADJ_SETOFFSET 0x0100 + +#include +static int clock_adjtime(clockid_t id, struct timex *tx) +{ + return syscall(__NR_clock_adjtime, id, tx); +} + /* clear NTP time_status & time_state */ int clear_time_state(void) @@ -193,10 +203,137 @@ out: } +int set_offset(long long offset, int use_nano) +{ + struct timex tmx = {}; + int ret; + + tmx.modes = ADJ_SETOFFSET; + if (use_nano) { + tmx.modes |= ADJ_NANO; + + tmx.time.tv_sec = offset / NSEC_PER_SEC; + tmx.time.tv_usec = offset % NSEC_PER_SEC; + + if (offset < 0 && tmx.time.tv_usec) { + tmx.time.tv_sec -= 1; + tmx.time.tv_usec += NSEC_PER_SEC; + } + } else { + tmx.time.tv_sec = offset / USEC_PER_SEC; + tmx.time.tv_usec = offset % USEC_PER_SEC; + + if (offset < 0 && tmx.time.tv_usec) { + tmx.time.tv_sec -= 1; + tmx.time.tv_usec += USEC_PER_SEC; + } + } + + ret = clock_adjtime(CLOCK_REALTIME, &tmx); + if (ret < 0) { + printf("(sec: %ld usec: %ld) ", tmx.time.tv_sec, tmx.time.tv_usec); + printf("[FAIL]\n"); + return -1; + } + return 0; +} + +int set_bad_offset(long sec, long usec, int use_nano) +{ + struct timex tmx = {}; + int ret; + + tmx.modes = ADJ_SETOFFSET; + if (use_nano) + tmx.modes |= ADJ_NANO; + + tmx.time.tv_sec = sec; + tmx.time.tv_usec = usec; + ret = clock_adjtime(CLOCK_REALTIME, &tmx); + if (ret >= 0) { + printf("Invalid (sec: %ld usec: %ld) did not fail! ", tmx.time.tv_sec, tmx.time.tv_usec); + printf("[FAIL]\n"); + return -1; + } + return 0; +} + +int validate_set_offset(void) +{ + printf("Testing ADJ_SETOFFSET... "); + + /* Test valid values */ + if (set_offset(NSEC_PER_SEC - 1, 1)) + return -1; + + if (set_offset(-NSEC_PER_SEC + 1, 1)) + return -1; + + if (set_offset(-NSEC_PER_SEC - 1, 1)) + return -1; + + if (set_offset(5 * NSEC_PER_SEC, 1)) + return -1; + + if (set_offset(-5 * NSEC_PER_SEC, 1)) + return -1; + + if (set_offset(5 * NSEC_PER_SEC + NSEC_PER_SEC / 2, 1)) + return -1; + + if (set_offset(-5 * NSEC_PER_SEC - NSEC_PER_SEC / 2, 1)) + return -1; + + if (set_offset(USEC_PER_SEC - 1, 0)) + return -1; + + if (set_offset(-USEC_PER_SEC + 1, 0)) + return -1; + + if (set_offset(-USEC_PER_SEC - 1, 0)) + return -1; + + if (set_offset(5 * USEC_PER_SEC, 0)) + return -1; + + if (set_offset(-5 * USEC_PER_SEC, 0)) + return -1; + + if (set_offset(5 * USEC_PER_SEC + USEC_PER_SEC / 2, 0)) + return -1; + + if (set_offset(-5 * USEC_PER_SEC - USEC_PER_SEC / 2, 0)) + return -1; + + /* Test invalid values */ + if (set_bad_offset(0, -1, 1)) + return -1; + if (set_bad_offset(0, -1, 0)) + return -1; + if (set_bad_offset(0, 2 * NSEC_PER_SEC, 1)) + return -1; + if (set_bad_offset(0, 2 * USEC_PER_SEC, 0)) + return -1; + if (set_bad_offset(0, NSEC_PER_SEC, 1)) + return -1; + if (set_bad_offset(0, USEC_PER_SEC, 0)) + return -1; + if (set_bad_offset(0, -NSEC_PER_SEC, 1)) + return -1; + if (set_bad_offset(0, -USEC_PER_SEC, 0)) + return -1; + + printf("[OK]\n"); + return 0; +} + int main(int argc, char **argv) { if (validate_freq()) return ksft_exit_fail(); + if (validate_set_offset()) + return ksft_exit_fail(); + return ksft_exit_pass(); } -- cgit v1.2.3 From 76e9f0ee52b0be5761e29847e0ef01f23f24f1df Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Jan 2016 09:43:28 -0800 Subject: phys_to_pfn_t: use phys_addr_t A dma_addr_t is potentially smaller than a phys_addr_t on some archs. Don't truncate the address when doing the pfn conversion. Cc: Ross Zwisler Reported-by: Matthew Wilcox [willy: fix pfn_t_to_phys as well] Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 7ec7df9e7fc7..0c1a7e65bb81 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -113,7 +113,7 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res, } EXPORT_SYMBOL(__wrap_devm_memremap_pages); -pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags) { struct nfit_test_resource *nfit_res = get_nfit_res(addr); -- cgit v1.2.3 From ec183d22cc284a7a1e17f0341219d8ec8ca070cc Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 26 Jan 2016 14:05:20 +0200 Subject: perf tools: tracepoint_error() can receive e=NULL, robustify it Fixes segmentation fault using, for instance: (gdb) run record -I -e intel_pt/tsc=1,noretcomp=1/u /bin/ls Starting program: /home/acme/bin/perf record -I -e intel_pt/tsc=1,noretcomp=1/u /bin/ls Missing separate debuginfos, use: dnf debuginfo-install glibc-2.22-7.fc23.x86_64 [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib64/libthread_db.so.1". Program received signal SIGSEGV, Segmentation fault. 0 x00000000004b9ea5 in tracepoint_error (e=0x0, err=13, sys=0x19b1370 "sched", name=0x19a5d00 "sched_switch") at util/parse-events.c:410 (gdb) bt #0 0x00000000004b9ea5 in tracepoint_error (e=0x0, err=13, sys=0x19b1370 "sched", name=0x19a5d00 "sched_switch") at util/parse-events.c:410 #1 0x00000000004b9fc5 in add_tracepoint (list=0x19a5d20, idx=0x7fffffffb8c0, sys_name=0x19b1370 "sched", evt_name=0x19a5d00 "sched_switch", err=0x0, head_config=0x0) at util/parse-events.c:433 #2 0x00000000004ba334 in add_tracepoint_event (list=0x19a5d20, idx=0x7fffffffb8c0, sys_name=0x19b1370 "sched", evt_name=0x19a5d00 "sched_switch", err=0x0, head_config=0x0) at util/parse-events.c:498 #3 0x00000000004bb699 in parse_events_add_tracepoint (list=0x19a5d20, idx=0x7fffffffb8c0, sys=0x19b1370 "sched", event=0x19a5d00 "sched_switch", err=0x0, head_config=0x0) at util/parse-events.c:936 #4 0x00000000004f6eda in parse_events_parse (_data=0x7fffffffb8b0, scanner=0x19a49d0) at util/parse-events.y:391 #5 0x00000000004bc8e5 in parse_events__scanner (str=0x663ff2 "sched:sched_switch", data=0x7fffffffb8b0, start_token=258) at util/parse-events.c:1361 #6 0x00000000004bca57 in parse_events (evlist=0x19a5220, str=0x663ff2 "sched:sched_switch", err=0x0) at util/parse-events.c:1401 #7 0x0000000000518d5f in perf_evlist__can_select_event (evlist=0x19a3b90, str=0x663ff2 "sched:sched_switch") at util/record.c:253 #8 0x0000000000553c42 in intel_pt_track_switches (evlist=0x19a3b90) at arch/x86/util/intel-pt.c:364 #9 0x00000000005549d1 in intel_pt_recording_options (itr=0x19a2c40, evlist=0x19a3b90, opts=0x8edf68 ) at arch/x86/util/intel-pt.c:664 #10 0x000000000051e076 in auxtrace_record__options (itr=0x19a2c40, evlist=0x19a3b90, opts=0x8edf68 ) at util/auxtrace.c:539 #11 0x0000000000433368 in cmd_record (argc=1, argv=0x7fffffffde60, prefix=0x0) at builtin-record.c:1264 #12 0x000000000049bec2 in run_builtin (p=0x8fa2a8 , argc=5, argv=0x7fffffffde60) at perf.c:390 #13 0x000000000049c12a in handle_internal_command (argc=5, argv=0x7fffffffde60) at perf.c:451 #14 0x000000000049c278 in run_argv (argcp=0x7fffffffdcbc, argv=0x7fffffffdcb0) at perf.c:495 #15 0x000000000049c60a in main (argc=5, argv=0x7fffffffde60) at perf.c:618 (gdb) Intel PT attempts to find the sched:sched_switch tracepoint but that seg faults if tracefs is not readable, because the error reporting structure is null, as errors are not reported when automatically adding tracepoints. Fix by checking before using. Committer note: This doesn't take place in a kernel that supports perf_event_attr.context_switch, that is the default way that will be used for tracking context switches, only in older kernels, like 4.2, in a machine with Intel PT (e.g. Broadwell) for non-priviledged users. Further info from a similar patch by Wang: The error is in tracepoint_error: it assumes the 'e' parameter is valid. However, there are many situation a parse_event() can be called without parse_events_error. See result of $ grep 'parse_events(.*NULL)' ./tools/perf/ -r' Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Tong Zhang Cc: Wang Nan Cc: stable@vger.kernel.org # v4.4+ Fixes: 196581717d85 ("perf tools: Enhance parsing events tracepoint error output") Link: http://lkml.kernel.org/r/1453809921-24596-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4f7b0efdde2f..813d9b272c81 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -399,6 +399,9 @@ static void tracepoint_error(struct parse_events_error *e, int err, { char help[BUFSIZ]; + if (!e) + return; + /* * We get error directly from syscall errno ( > 0), * or from encoded pointer's error ( < 0). -- cgit v1.2.3 From 3a4acda1ecbd290973de08250d7dcdfaf5b2fe0f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 1 Feb 2016 03:21:04 +0000 Subject: perf tools: Fix thread lifetime related segfaut in intel_pt intel_pt_process_auxtrace_info() creates a pt->unknown_thread thread that eventually needs to be freed by the last thread__put() on it, when its refcount hits zero, which may happen in intel_pt_process_auxtrace_info() error handling path and triggers the following segfault, which would happen as well at intel_pt_free, when tools using this intel_pt codebase frees up resources: # perf record -I -e intel_pt/tsc=1,noretcomp=1/u /bin/ls 0 a anaconda-ks.cfg bin perf.data perf.data.old perf-f23-bringup.todo [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.217 MB perf.data ] # # perf script -F event,comm,pid,tid,time,addr,ip,sym,dso,iregs Samples for 'instructions:u' event do not have IREGS attribute set. Cannot print 'iregs' field. intel_pt_synth_events: failed to synthesize 'instructions' event type Segmentation fault (core dumped) # The problem is: there's a union in 'struct thread' combines a list_head and a rb_node. The standard life cycle of a thread is: init rb_node in the constructor, insert it into machine->threads rbtree using rb_node, move it to machine->dead_threads using list_head, clean in the last thread__put: list_del_init(&thread->node). In the above command, it clean a thread before adding it into list, causes the above segfault. Since pt->unknown_thread will never live in an rbtree, initialize its list node so that when list_del_init() is done on it we don't segfault. After this patch: # perf script -F event,comm,pid,tid,time,addr,ip,sym,dso,iregs Samples for 'instructions:u' event do not have IREGS attribute set. Cannot print 'iregs' field. intel_pt_synth_events: failed to synthesize 'instructions' event type 0x248 [0x88]: failed to process type: 70 # Reported-by: Tong Zhang Reported-by: Wang Nan Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Josh Poimboeuf Link: http://lkml.kernel.org/r/1454296865-19749-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 81a2eb77ba7f..05d815851be1 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -2068,6 +2068,15 @@ int intel_pt_process_auxtrace_info(union perf_event *event, err = -ENOMEM; goto err_free_queues; } + + /* + * Since this thread will not be kept in any rbtree not in a + * list, initialize its list node so that at thread__put() the + * current thread lifetime assuption is kept and we don't segfault + * at list_del_init(). + */ + INIT_LIST_HEAD(&pt->unknown_thread->node); + err = thread__set_comm(pt->unknown_thread, "unknown", 0); if (err) goto err_delete_thread; -- cgit v1.2.3 From 270bde1e76f400d81f8d0ab68905a18ee17fa2e8 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Tue, 2 Feb 2016 20:56:46 +0530 Subject: perf probe: Search both .eh_frame and .debug_frame sections for probe location 'perf probe' through debuginfo__find_probes() in util/probe-finder.c checks for the functions' frame descriptions in either .eh_frame section of an ELF or the .debug_frame. The check is based on whether either one of these sections is present. Depending on distro, toolchain defaults, architetcutre, build flags, etc., CFI might be found in either .eh_frame and/or .debug_frame. Sometimes, it may happen that, .eh_frame, even if present, may not be complete and may miss some descriptions. Therefore, to be sure, to find the CFI covering an address we will always have to investigate both if available. For e.g., in powerpc, this may happen: $ gcc -g bin.c -o bin $ objdump --dwarf ./bin <1><145>: Abbrev Number: 7 (DW_TAG_subprogram) <146> DW_AT_external : 1 <146> DW_AT_name : (indirect string, offset: 0x9e): main <14a> DW_AT_decl_file : 1 <14b> DW_AT_decl_line : 39 <14c> DW_AT_prototyped : 1 <14c> DW_AT_type : <0x57> <150> DW_AT_low_pc : 0x100007b8 If the .eh_frame and .debug_frame are checked for the same binary, we will find that, .eh_frame (although present) doesn't contain a description for "main" function. But, .debug_frame has a description: 000000d8 00000024 00000000 FDE cie=00000000 pc=100007b8..10000838 DW_CFA_advance_loc: 16 to 100007c8 DW_CFA_def_cfa_offset: 144 DW_CFA_offset_extended_sf: r65 at cfa+16 ... Due to this (since, perf checks whether .eh_frame is present and goes on searching for that address inside that frame), perf is unable to process the probes: # perf probe -x ./bin main Failed to get call frame on 0x100007b8 Error: Failed to add events. To avoid this issue, we need to check both the sections (.eh_frame and .debug_frame), which is done in this patch. Note that, we can always force everything into both .eh_frame and .debug_frame by: $ gcc bin.c -fasynchronous-unwind-tables -fno-dwarf2-cfi-asm -g -o bin Signed-off-by: Hemant Kumar Acked-by: Masami Hiramatsu Cc: linuxppc-dev@lists.ozlabs.org Cc: Mark Wielaard Cc: Naveen N. Rao Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1454426806-13974-1-git-send-email-hemant@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-finder.c | 62 +++++++++++++++++++++++++----------------- tools/perf/util/probe-finder.h | 5 +++- 2 files changed, 41 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 2be10fb27172..4ce5c5e18f48 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -686,8 +686,9 @@ static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf) pf->fb_ops = NULL; #if _ELFUTILS_PREREQ(0, 142) } else if (nops == 1 && pf->fb_ops[0].atom == DW_OP_call_frame_cfa && - pf->cfi != NULL) { - if (dwarf_cfi_addrframe(pf->cfi, pf->addr, &frame) != 0 || + (pf->cfi_eh != NULL || pf->cfi_dbg != NULL)) { + if ((dwarf_cfi_addrframe(pf->cfi_eh, pf->addr, &frame) != 0 && + (dwarf_cfi_addrframe(pf->cfi_dbg, pf->addr, &frame) != 0)) || dwarf_frame_cfa(frame, &pf->fb_ops, &nops) != 0) { pr_warning("Failed to get call frame on 0x%jx\n", (uintmax_t)pf->addr); @@ -1015,8 +1016,7 @@ static int pubname_search_cb(Dwarf *dbg, Dwarf_Global *gl, void *data) return DWARF_CB_OK; } -/* Find probe points from debuginfo */ -static int debuginfo__find_probes(struct debuginfo *dbg, +static int debuginfo__find_probe_location(struct debuginfo *dbg, struct probe_finder *pf) { struct perf_probe_point *pp = &pf->pev->point; @@ -1025,27 +1025,6 @@ static int debuginfo__find_probes(struct debuginfo *dbg, Dwarf_Die *diep; int ret = 0; -#if _ELFUTILS_PREREQ(0, 142) - Elf *elf; - GElf_Ehdr ehdr; - GElf_Shdr shdr; - - /* Get the call frame information from this dwarf */ - elf = dwarf_getelf(dbg->dbg); - if (elf == NULL) - return -EINVAL; - - if (gelf_getehdr(elf, &ehdr) == NULL) - return -EINVAL; - - if (elf_section_by_name(elf, &ehdr, &shdr, ".eh_frame", NULL) && - shdr.sh_type == SHT_PROGBITS) { - pf->cfi = dwarf_getcfi_elf(elf); - } else { - pf->cfi = dwarf_getcfi(dbg->dbg); - } -#endif - off = 0; pf->lcache = intlist__new(NULL); if (!pf->lcache) @@ -1108,6 +1087,39 @@ found: return ret; } +/* Find probe points from debuginfo */ +static int debuginfo__find_probes(struct debuginfo *dbg, + struct probe_finder *pf) +{ + int ret = 0; + +#if _ELFUTILS_PREREQ(0, 142) + Elf *elf; + GElf_Ehdr ehdr; + GElf_Shdr shdr; + + if (pf->cfi_eh || pf->cfi_dbg) + return debuginfo__find_probe_location(dbg, pf); + + /* Get the call frame information from this dwarf */ + elf = dwarf_getelf(dbg->dbg); + if (elf == NULL) + return -EINVAL; + + if (gelf_getehdr(elf, &ehdr) == NULL) + return -EINVAL; + + if (elf_section_by_name(elf, &ehdr, &shdr, ".eh_frame", NULL) && + shdr.sh_type == SHT_PROGBITS) + pf->cfi_eh = dwarf_getcfi_elf(elf); + + pf->cfi_dbg = dwarf_getcfi(dbg->dbg); +#endif + + ret = debuginfo__find_probe_location(dbg, pf); + return ret; +} + struct local_vars_finder { struct probe_finder *pf; struct perf_probe_arg *args; diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h index bed82716e1b4..0aec7704e395 100644 --- a/tools/perf/util/probe-finder.h +++ b/tools/perf/util/probe-finder.h @@ -76,7 +76,10 @@ struct probe_finder { /* For variable searching */ #if _ELFUTILS_PREREQ(0, 142) - Dwarf_CFI *cfi; /* Call Frame Information */ + /* Call Frame Information from .eh_frame */ + Dwarf_CFI *cfi_eh; + /* Call Frame Information from .debug_frame */ + Dwarf_CFI *cfi_dbg; #endif Dwarf_Op *fb_ops; /* Frame base attribute */ struct perf_probe_arg *pvar; /* Current target variable */ -- cgit v1.2.3 From 51fd2df1e882a3c2a3f4b6c9ff243a93c9046dba Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 3 Feb 2016 08:43:56 +0100 Subject: perf stat: Fix interval output values We broke interval data displays with commit: 3f416f22d1e2 ("perf stat: Do not clean event's private stats") This commit removed stats cleaning, which is important for '-r' option to carry counters data over the whole run. But it's necessary to clean it for interval mode, otherwise the displayed value is avg of all previous values. Before: $ perf stat -e cycles -a -I 1000 record # time counts unit events 1.000240796 75,216,287 cycles 2.000512791 107,823,524 cycles $ perf stat report # time counts unit events 1.000240796 75,216,287 cycles 2.000512791 91,519,906 cycles Now: $ perf stat report # time counts unit events 1.000240796 75,216,287 cycles 2.000512791 107,823,524 cycles Notice the second value being bigger (91,.. < 107,..). This could be easily verified by using perf script which displays raw stat data: $ perf script CPU THREAD VAL ENA RUN TIME EVENT 0 -1 23855779 1000209530 1000209530 1000240796 cycles 1 -1 33340397 1000224964 1000224964 1000240796 cycles 2 -1 15835415 1000226695 1000226695 1000240796 cycles 3 -1 2184696 1000228245 1000228245 1000240796 cycles 0 -1 97014312 2000514533 2000514533 2000512791 cycles 1 -1 46121497 2000543795 2000543795 2000512791 cycles 2 -1 32269530 2000543566 2000543566 2000512791 cycles 3 -1 7634472 2000544108 2000544108 2000512791 cycles The sum of the first 4 values is the first interval aggregated value: 23855779 + 33340397 + 15835415 + 2184696 = 75,216,287 The sum of the second 4 values minus first value is the second interval aggregated value: 97014312 + 46121497 + 32269530 + 7634472 - 75216287 = 107,823,524 Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1454485436-20639-1-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 2b58edccd56f..afb0c45eba34 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -311,6 +311,16 @@ int perf_stat_process_counter(struct perf_stat_config *config, aggr->val = aggr->ena = aggr->run = 0; + /* + * We calculate counter's data every interval, + * and the display code shows ps->res_stats + * avg value. We need to zero the stats for + * interval mode, otherwise overall avg running + * averages will be shown for each interval. + */ + if (config->interval) + init_stats(ps->res_stats); + if (counter->per_pkg) zero_per_pkg(counter); -- cgit v1.2.3 From ed8b0de5a33d2a2557dce7f9429dca8cb5bc5879 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Mon, 8 Feb 2016 14:48:15 -0500 Subject: efi: Make efivarfs entries immutable by default "rm -rf" is bricking some peoples' laptops because of variables being used to store non-reinitializable firmware driver data that's required to POST the hardware. These are 100% bugs, and they need to be fixed, but in the mean time it shouldn't be easy to *accidentally* brick machines. We have to have delete working, and picking which variables do and don't work for deletion is quite intractable, so instead make everything immutable by default (except for a whitelist), and make tools that aren't quite so broad-spectrum unset the immutable flag. Signed-off-by: Peter Jones Tested-by: Lee, Chun-Yi Acked-by: Matthew Garrett Signed-off-by: Matt Fleming --- tools/testing/selftests/efivarfs/efivarfs.sh | 19 +++++-- tools/testing/selftests/efivarfs/open-unlink.c | 72 +++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/efivarfs/efivarfs.sh b/tools/testing/selftests/efivarfs/efivarfs.sh index 77edcdcc016b..057278448515 100755 --- a/tools/testing/selftests/efivarfs/efivarfs.sh +++ b/tools/testing/selftests/efivarfs/efivarfs.sh @@ -88,7 +88,11 @@ test_delete() exit 1 fi - rm $file + rm $file 2>/dev/null + if [ $? -ne 0 ]; then + chattr -i $file + rm $file + fi if [ -e $file ]; then echo "$file couldn't be deleted" >&2 @@ -111,6 +115,7 @@ test_zero_size_delete() exit 1 fi + chattr -i $file printf "$attrs" > $file if [ -e $file ]; then @@ -141,7 +146,11 @@ test_valid_filenames() echo "$file could not be created" >&2 ret=1 else - rm $file + rm $file 2>/dev/null + if [ $? -ne 0 ]; then + chattr -i $file + rm $file + fi fi done @@ -174,7 +183,11 @@ test_invalid_filenames() if [ -e $file ]; then echo "Creating $file should have failed" >&2 - rm $file + rm $file 2>/dev/null + if [ $? -ne 0 ]; then + chattr -i $file + rm $file + fi ret=1 fi done diff --git a/tools/testing/selftests/efivarfs/open-unlink.c b/tools/testing/selftests/efivarfs/open-unlink.c index 8c0764407b3c..4af74f733036 100644 --- a/tools/testing/selftests/efivarfs/open-unlink.c +++ b/tools/testing/selftests/efivarfs/open-unlink.c @@ -1,10 +1,68 @@ +#include #include #include #include #include +#include #include #include #include +#include + +static int set_immutable(const char *path, int immutable) +{ + unsigned int flags; + int fd; + int rc; + int error; + + fd = open(path, O_RDONLY); + if (fd < 0) + return fd; + + rc = ioctl(fd, FS_IOC_GETFLAGS, &flags); + if (rc < 0) { + error = errno; + close(fd); + errno = error; + return rc; + } + + if (immutable) + flags |= FS_IMMUTABLE_FL; + else + flags &= ~FS_IMMUTABLE_FL; + + rc = ioctl(fd, FS_IOC_SETFLAGS, &flags); + error = errno; + close(fd); + errno = error; + return rc; +} + +static int get_immutable(const char *path) +{ + unsigned int flags; + int fd; + int rc; + int error; + + fd = open(path, O_RDONLY); + if (fd < 0) + return fd; + + rc = ioctl(fd, FS_IOC_GETFLAGS, &flags); + if (rc < 0) { + error = errno; + close(fd); + errno = error; + return rc; + } + close(fd); + if (flags & FS_IMMUTABLE_FL) + return 1; + return 0; +} int main(int argc, char **argv) { @@ -27,7 +85,7 @@ int main(int argc, char **argv) buf[4] = 0; /* create a test variable */ - fd = open(path, O_WRONLY | O_CREAT); + fd = open(path, O_WRONLY | O_CREAT, 0600); if (fd < 0) { perror("open(O_WRONLY)"); return EXIT_FAILURE; @@ -41,6 +99,18 @@ int main(int argc, char **argv) close(fd); + rc = get_immutable(path); + if (rc < 0) { + perror("ioctl(FS_IOC_GETFLAGS)"); + return EXIT_FAILURE; + } else if (rc) { + rc = set_immutable(path, 0); + if (rc < 0) { + perror("ioctl(FS_IOC_SETFLAGS)"); + return EXIT_FAILURE; + } + } + fd = open(path, O_RDONLY); if (fd < 0) { perror("open"); -- cgit v1.2.3