From 13c4a90119d28cfcb6b5bdd820c233b86c2b0237 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Sat, 13 Jun 2015 09:02:48 -0600 Subject: seccomp: add ptrace options for suspend/resume This patch is the first step in enabling checkpoint/restore of processes with seccomp enabled. One of the things CRIU does while dumping tasks is inject code into them via ptrace to collect information that is only available to the process itself. However, if we are in a seccomp mode where these processes are prohibited from making these syscalls, then what CRIU does kills the task. This patch adds a new ptrace option, PTRACE_O_SUSPEND_SECCOMP, that enables a task from the init user namespace which has CAP_SYS_ADMIN and no seccomp filters to disable (and re-enable) seccomp filters for another task so that they can be successfully dumped (and restored). We restrict the set of processes that can disable seccomp through ptrace because although today ptrace can be used to bypass seccomp, there is some discussion of closing this loophole in the future and we would like this patch to not depend on that behavior and be future proofed for when it is removed. Note that seccomp can be suspended before any filters are actually installed; this behavior is useful on criu restore, so that we can suspend seccomp, restore the filters, unmap our restore code from the restored process' address space, and then resume the task by detaching and have the filters resumed as well. v2 changes: * require that the tracer have no seccomp filters installed * drop TIF_NOTSC manipulation from the patch * change from ptrace command to a ptrace option and use this ptrace option as the flag to check. This means that as soon as the tracer detaches/dies, seccomp is re-enabled and as a corrollary that one can not disable seccomp across PTRACE_ATTACHs. v3 changes: * get rid of various #ifdefs everywhere * report more sensible errors when PTRACE_O_SUSPEND_SECCOMP is incorrectly used v4 changes: * get rid of may_suspend_seccomp() in favor of a capable() check in ptrace directly v5 changes: * check that seccomp is not enabled (or suspended) on the tracer Signed-off-by: Tycho Andersen CC: Will Drewry CC: Roland McGrath CC: Pavel Emelyanov CC: Serge E. Hallyn Acked-by: Oleg Nesterov Acked-by: Andy Lutomirski [kees: access seccomp.mode through seccomp_mode() instead] Signed-off-by: Kees Cook --- include/uapi/linux/ptrace.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index cf1019e15f5b..a7a697986614 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -89,9 +89,11 @@ struct ptrace_peeksiginfo_args { #define PTRACE_O_TRACESECCOMP (1 << PTRACE_EVENT_SECCOMP) /* eventless options */ -#define PTRACE_O_EXITKILL (1 << 20) +#define PTRACE_O_EXITKILL (1 << 20) +#define PTRACE_O_SUSPEND_SECCOMP (1 << 21) -#define PTRACE_O_MASK (0x000000ff | PTRACE_O_EXITKILL) +#define PTRACE_O_MASK (\ + 0x000000ff | PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP) #include -- cgit v1.2.3 From 8ab30c1538b14424015e45063c41d509b24c1dea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:29:53 +0100 Subject: KVM: add comments for kvm_debug_exit_arch struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bring into line with the comments for the other structures and their KVM_EXIT_* cases. Also update api.txt to reflect use in kvm_run documentation. Signed-off-by: Alex Bennée Reviewed-by: David Hildenbrand Reviewed-by: Andrew Jones Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/uapi/linux/kvm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 716ad4ae4d4b..4ab3c6a8d563 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -237,6 +237,7 @@ struct kvm_run { __u32 count; __u64 data_offset; /* relative to kvm_run start */ } io; + /* KVM_EXIT_DEBUG */ struct { struct kvm_debug_exit_arch arch; } debug; @@ -285,6 +286,7 @@ struct kvm_run { __u32 data; __u8 is_write; } dcr; + /* KVM_EXIT_INTERNAL_ERROR */ struct { __u32 suberror; /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */ @@ -295,6 +297,7 @@ struct kvm_run { struct { __u64 gprs[32]; } osi; + /* KVM_EXIT_PAPR_HCALL */ struct { __u64 nr; __u64 ret; -- cgit v1.2.3 From 5540546bc93b49f98a0466fe3f96615286c76574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:30:01 +0100 Subject: KVM: arm64: guest debug, HW assisted debug support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for userspace to control the HW debug registers for guest debug. In the debug ioctl we copy an IMPDEF registers into a new register set called host_debug_state. We use the recently introduced vcpu parameter debug_ptr to select which register set is copied into the real registers when world switch occurs. I've made some helper functions from hw_breakpoint.c more widely available for re-use. As with single step we need to tweak the guest registers to enable the exceptions so we need to save and restore those bits. Two new capabilities have been added to the KVM_EXTENSION ioctl to allow userspace to query the number of hardware break and watch points available on the host hardware. Signed-off-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/uapi/linux/kvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4ab3c6a8d563..a1e08e7bbf20 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -820,6 +820,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_DISABLE_QUIRKS 116 #define KVM_CAP_X86_SMM 117 #define KVM_CAP_MULTI_ADDRESS_SPACE 118 +#define KVM_CAP_GUEST_DEBUG_HW_BPS 119 +#define KVM_CAP_GUEST_DEBUG_HW_WPS 120 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From fc5462f8525b47fa219452289ecb22c921c16823 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 18:09:11 -0600 Subject: toshiba_acpi: Add /dev/toshiba_acpi device There were previous attempts to "merge" the toshiba SMM module to the toshiba_acpi one, they were trying to imitate what the old toshiba module does, however, some models (TOS1900 devices) come with a "crippled" implementation and do not provide all the "features" a "genuine" Toshiba BIOS does. This patch adds a new device called toshiba_acpi, which aim is to enable userspace to access the SMM on Toshiba laptops via ACPI calls. Creating a new convenience _IOWR command to access the SCI functions by opening/closing the SCI internally to avoid buggy BIOS, while at the same time providing backwards compatibility. Older programs (and new) who wish to access the SMM on newer models can do it by pointing their path to /dev/toshiba_acpi (instead of /dev/toshiba) as the toshiba.h header was modified to reflect these changes as well as adds all the toshiba_acpi paths and command, however, it is strongly recommended to use the new IOCTL for any SCI command to avoid any buggy BIOS. Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- include/uapi/linux/toshiba.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/toshiba.h b/include/uapi/linux/toshiba.h index e9bef5b2f91e..c58bf4b5bb26 100644 --- a/include/uapi/linux/toshiba.h +++ b/include/uapi/linux/toshiba.h @@ -1,6 +1,7 @@ /* toshiba.h -- Linux driver for accessing the SMM on Toshiba laptops * * Copyright (c) 1996-2000 Jonathan A. Buzzard (jonathan@buzzard.org.uk) + * Copyright (c) 2015 Azael Avalos * * Thanks to Juergen Heinzl for the pointers * on making sure the structure is aligned and packed. @@ -20,9 +21,18 @@ #ifndef _UAPI_LINUX_TOSHIBA_H #define _UAPI_LINUX_TOSHIBA_H -#define TOSH_PROC "/proc/toshiba" -#define TOSH_DEVICE "/dev/toshiba" -#define TOSH_SMM _IOWR('t', 0x90, int) /* broken: meant 24 bytes */ +/* + * Toshiba modules paths + */ + +#define TOSH_PROC "/proc/toshiba" +#define TOSH_DEVICE "/dev/toshiba" +#define TOSHIBA_ACPI_PROC "/proc/acpi/toshiba" +#define TOSHIBA_ACPI_DEVICE "/dev/toshiba_acpi" + +/* + * Toshiba SMM structure + */ typedef struct { unsigned int eax; @@ -33,5 +43,21 @@ typedef struct { unsigned int edi __attribute__ ((packed)); } SMMRegisters; +/* + * IOCTLs (0x90 - 0x91) + */ + +#define TOSH_SMM _IOWR('t', 0x90, SMMRegisters) +/* + * Convenience toshiba_acpi command. + * + * The System Configuration Interface (SCI) is opened/closed internally + * to avoid userspace of buggy BIOSes. + * + * The toshiba_acpi module checks whether the eax register is set with + * SCI_GET (0xf300) or SCI_SET (0xf400), returning -EINVAL if not. + */ +#define TOSHIBA_ACPI_SCI _IOWR('t', 0x91, SMMRegisters) + #endif /* _UAPI_LINUX_TOSHIBA_H */ -- cgit v1.2.3 From ec92777f2ba93c00387b8fe53780c25adc57c744 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 9 Jul 2015 13:25:35 -0600 Subject: libnvdimm: Update name of the ars_status_record mask field The spec suggests that this is a simple 'length' field, not a mask. Update the name accordingly. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- include/uapi/linux/ndctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 2b94ea2287bb..e94bc20016b2 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -87,7 +87,7 @@ struct nd_cmd_ars_status { __u32 handle; __u32 flags; __u64 err_address; - __u64 mask; + __u64 length; } __packed records[0]; } __packed; -- cgit v1.2.3 From 39c686b862cdb2049b90e095b6c6c727b2a7ab60 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 9 Jul 2015 13:25:36 -0600 Subject: libnvdimm: Add DSM support for Address Range Scrub commands Add support for the three ARS DSM commands: - Query ARS Capabilities - Queries the firmware to check if a given range supports scrub, and if so, which type (persistent vs. volatile) - Start ARS - Starts a scrub for a given range/type - Query ARS Status - Checks status of a previously started scrub, and provides the error logs if any. The commands are described by the example DSM spec at: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Also add these commands to the nfit_test test framework, and return canned data. Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- include/uapi/linux/ndctl.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index e94bc20016b2..5b4a4be06e2b 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -111,6 +111,11 @@ enum { ND_CMD_VENDOR = 9, }; +enum { + ND_ARS_VOLATILE = 1, + ND_ARS_PERSISTENT = 2, +}; + static inline const char *nvdimm_bus_cmd_name(unsigned cmd) { static const char * const names[] = { @@ -194,4 +199,9 @@ enum nd_driver_flags { enum { ND_MIN_NAMESPACE_SIZE = 0x00400000, }; + +enum ars_masks { + ARS_STATUS_MASK = 0x0000FFFF, + ARS_EXT_STATUS_SHIFT = 16, +}; #endif /* __NDCTL_H__ */ -- cgit v1.2.3 From 34d99af52ad40bd498ba66970579a5bc1fb1a3bc Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 5 Aug 2015 16:29:37 -0400 Subject: audit: implement audit by executable This adds the ability audit the actions of a not-yet-running process. This patch implements the ability to filter on the executable path. Instead of just hard coding the ino and dev of the executable we care about at the moment the rule is inserted into the kernel, use the new audit_fsnotify infrastructure to manage this dynamically. This means that if the filename does not yet exist but the containing directory does, or if the inode in question is unlinked and creat'd (aka updated) the rule will just continue to work. If the containing directory is moved or deleted or the filesystem is unmounted, the rule is deleted automatically. A future enhancement would be to have the rule survive across directory disruptions. This is a heavily modified version of a patch originally submitted by Eric Paris with some ideas from Peter Moody. Cc: Peter Moody Cc: Eric Paris Signed-off-by: Richard Guy Briggs [PM: minor whitespace clean to satisfy ./scripts/checkpatch] Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index d3475e1f15ec..f6ff62c24aba 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -266,6 +266,7 @@ #define AUDIT_OBJ_UID 109 #define AUDIT_OBJ_GID 110 #define AUDIT_FIELD_COMPARE 111 +#define AUDIT_EXE 112 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) @@ -324,8 +325,10 @@ enum { #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT 0x00000001 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002 +#define AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH 0x00000004 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \ - AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME) + AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \ + AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH) /* deprecated: AUDIT_VERSION_* */ #define AUDIT_VERSION_LATEST AUDIT_FEATURE_BITMAP_ALL -- cgit v1.2.3 From b14132797d8041a42e03f4ffa1e722da1425adfb Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 18 Aug 2015 03:28:01 -0400 Subject: elf-em.h: move EM_MICROBLAZE to the common header The linux/audit.h header uses EM_MICROBLAZE in order to define AUDIT_ARCH_MICROBLAZE, but it's only available in the microblaze asm headers. Move it to the common elf-em.h header so that the define can be used on non-microblaze systems. Otherwise we get build errors that EM_MICROBLAZE isn't defined when we try to use the AUDIT_ARCH_MICROBLAZE symbol. Signed-off-by: Mike Frysinger Signed-off-by: Michal Simek --- include/uapi/linux/elf-em.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index 3429a3ba382b..b56dfcfe922a 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -39,6 +39,7 @@ #define EM_TI_C6000 140 /* TI C6X DSPs */ #define EM_AARCH64 183 /* ARM 64 bit */ #define EM_TILEPRO 188 /* Tilera TILEPro */ +#define EM_MICROBLAZE 189 /* Xilinx MicroBlaze */ #define EM_TILEGX 191 /* Tilera TILE-Gx */ #define EM_FRV 0x5441 /* Fujitsu FR-V */ #define EM_AVR32 0x18ad /* Atmel AVR32 */ -- cgit v1.2.3 From 1ab1e895492d8084dfc1c854efacde219e56b8c1 Mon Sep 17 00:00:00 2001 From: Henrik Austad Date: Wed, 9 Sep 2015 12:25:17 +0200 Subject: ether: add IEEE 1722 ethertype - TSN IEEE 1722 describes AVB (later renamed to TSN - Time Sensitive Networking), a protocol, encapsualtion and synchronization to utilize standard networks for audio/video (and later other time-sensitive) streams. This standard uses ethertype 0x22F0. http://standards.ieee.org/develop/regauth/ethertype/eth.txt This is a respin of a previous patch ("ether: add AVB frame type ETH_P_AVB") CC: "David S. Miller" CC: netdev@vger.kernel.org CC: linux-api@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Henrik Austad Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index aa63ed023c2b..ea9221b0331a 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -42,6 +42,7 @@ #define ETH_P_LOOP 0x0060 /* Ethernet Loopback packet */ #define ETH_P_PUP 0x0200 /* Xerox PUP packet */ #define ETH_P_PUPAT 0x0201 /* Xerox PUP Addr Trans packet */ +#define ETH_P_TSN 0x22F0 /* TSN (IEEE 1722) packet */ #define ETH_P_IP 0x0800 /* Internet Protocol packet */ #define ETH_P_X25 0x0805 /* CCITT X.25 */ #define ETH_P_ARP 0x0806 /* Address Resolution packet */ -- cgit v1.2.3 From f074a8f49eb87cde95ac9d040ad5e7ea4f029738 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 9 Sep 2015 15:35:48 -0700 Subject: proc: export idle flag via kpageflags As noted by Minchan, a benefit of reading idle flag from /proc/kpageflags is that one can easily filter dirty and/or unevictable pages while estimating the size of unused memory. Note that idle flag read from /proc/kpageflags may be stale in case the page was accessed via a PTE, because it would be too costly to iterate over all page mappings on each /proc/kpageflags read to provide an up-to-date value. To make sure the flag is up-to-date one has to read /sys/kernel/mm/page_idle/bitmap first. Signed-off-by: Vladimir Davydov Reviewed-by: Andres Lagar-Cavilla Cc: Minchan Kim Cc: Raghavendra K T Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Michel Lespinasse Cc: David Rientjes Cc: Pavel Emelyanov Cc: Cyrill Gorcunov Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/kernel-page-flags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index a6c4962e5d46..5da5f8751ce7 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -33,6 +33,7 @@ #define KPF_THP 22 #define KPF_BALLOON 23 #define KPF_ZERO_PAGE 24 +#define KPF_IDLE 25 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ -- cgit v1.2.3 From ac64a2ce509104a746321a4f9646b6750cf281eb Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Fri, 4 Sep 2015 01:39:56 +0200 Subject: target: use stringify.h instead of own definition Signed-off-by: David Disseldorp Acked-by: Andy Grover Signed-off-by: Nicholas Bellinger --- include/uapi/linux/target_core_user.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index b67f99d3c520..95c6521d8a95 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -42,10 +42,6 @@ #define TCMU_MAILBOX_VERSION 2 #define ALIGN_SIZE 64 /* Should be enough for most CPUs */ -/* See https://gcc.gnu.org/onlinedocs/cpp/Stringification.html */ -#define xstr(s) str(s) -#define str(s) #s - struct tcmu_mailbox { __u16 version; __u16 flags; -- cgit v1.2.3 From 5b25b13ab08f616efd566347d809b4ece54570d1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 11 Sep 2015 13:07:39 -0700 Subject: sys_membarrier(): system-wide memory barrier (generic, x86) Here is an implementation of a new system call, sys_membarrier(), which executes a memory barrier on all threads running on the system. It is implemented by calling synchronize_sched(). It can be used to distribute the cost of user-space memory barriers asymmetrically by transforming pairs of memory barriers into pairs consisting of sys_membarrier() and a compiler barrier. For synchronization primitives that distinguish between read-side and write-side (e.g. userspace RCU [1], rwlocks), the read-side can be accelerated significantly by moving the bulk of the memory barrier overhead to the write-side. The existing applications of which I am aware that would be improved by this system call are as follows: * Through Userspace RCU library (http://urcu.so) - DNS server (Knot DNS) https://www.knot-dns.cz/ - Network sniffer (http://netsniff-ng.org/) - Distributed object storage (https://sheepdog.github.io/sheepdog/) - User-space tracing (http://lttng.org) - Network storage system (https://www.gluster.org/) - Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf) - Financial software (https://lkml.org/lkml/2015/3/23/189) Those projects use RCU in userspace to increase read-side speed and scalability compared to locking. Especially in the case of RCU used by libraries, sys_membarrier can speed up the read-side by moving the bulk of the memory barrier cost to synchronize_rcu(). * Direct users of sys_membarrier - core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198) Microsoft core dotnet GC developers are planning to use the mprotect() side-effect of issuing memory barriers through IPIs as a way to implement Windows FlushProcessWriteBuffers() on Linux. They are referring to sys_membarrier in their github thread, specifically stating that sys_membarrier() is what they are looking for. To explain the benefit of this scheme, let's introduce two example threads: Thread A (non-frequent, e.g. executing liburcu synchronize_rcu()) Thread B (frequent, e.g. executing liburcu rcu_read_lock()/rcu_read_unlock()) In a scheme where all smp_mb() in thread A are ordering memory accesses with respect to smp_mb() present in Thread B, we can change each smp_mb() within Thread A into calls to sys_membarrier() and each smp_mb() within Thread B into compiler barriers "barrier()". Before the change, we had, for each smp_mb() pairs: Thread A Thread B previous mem accesses previous mem accesses smp_mb() smp_mb() following mem accesses following mem accesses After the change, these pairs become: Thread A Thread B prev mem accesses prev mem accesses sys_membarrier() barrier() follow mem accesses follow mem accesses As we can see, there are two possible scenarios: either Thread B memory accesses do not happen concurrently with Thread A accesses (1), or they do (2). 1) Non-concurrent Thread A vs Thread B accesses: Thread A Thread B prev mem accesses sys_membarrier() follow mem accesses prev mem accesses barrier() follow mem accesses In this case, thread B accesses will be weakly ordered. This is OK, because at that point, thread A is not particularly interested in ordering them with respect to its own accesses. 2) Concurrent Thread A vs Thread B accesses Thread A Thread B prev mem accesses prev mem accesses sys_membarrier() barrier() follow mem accesses follow mem accesses In this case, thread B accesses, which are ensured to be in program order thanks to the compiler barrier, will be "upgraded" to full smp_mb() by synchronize_sched(). * Benchmarks On Intel Xeon E5405 (8 cores) (one thread is calling sys_membarrier, the other 7 threads are busy looping) 1000 non-expedited sys_membarrier calls in 33s =3D 33 milliseconds/call. * User-space user of this system call: Userspace RCU library Both the signal-based and the sys_membarrier userspace RCU schemes permit us to remove the memory barrier from the userspace RCU rcu_read_lock() and rcu_read_unlock() primitives, thus significantly accelerating them. These memory barriers are replaced by compiler barriers on the read-side, and all matching memory barriers on the write-side are turned into an invocation of a memory barrier on all active threads in the process. By letting the kernel perform this synchronization rather than dumbly sending a signal to every process threads (as we currently do), we diminish the number of unnecessary wake ups and only issue the memory barriers on active threads. Non-running threads do not need to execute such barrier anyway, because these are implied by the scheduler context switches. Results in liburcu: Operations in 10s, 6 readers, 2 writers: memory barriers in reader: 1701557485 reads, 2202847 writes signal-based scheme: 9830061167 reads, 6700 writes sys_membarrier: 9952759104 reads, 425 writes sys_membarrier (dyn. check): 7970328887 reads, 425 writes The dynamic sys_membarrier availability check adds some overhead to the read-side compared to the signal-based scheme, but besides that, sys_membarrier slightly outperforms the signal-based scheme. However, this non-expedited sys_membarrier implementation has a much slower grace period than signal and memory barrier schemes. Besides diminishing the number of wake-ups, one major advantage of the membarrier system call over the signal-based scheme is that it does not need to reserve a signal. This plays much more nicely with libraries, and with processes injected into for tracing purposes, for which we cannot expect that signals will be unused by the application. An expedited version of this system call can be added later on to speed up the grace period. Its implementation will likely depend on reading the cpu_curr()->mm without holding each CPU's rq lock. This patch adds the system call to x86 and to asm-generic. [1] http://urcu.so membarrier(2) man page: MEMBARRIER(2) Linux Programmer's Manual MEMBARRIER(2) NAME membarrier - issue memory barriers on a set of threads SYNOPSIS #include int membarrier(int cmd, int flags); DESCRIPTION The cmd argument is one of the following: MEMBARRIER_CMD_QUERY Query the set of supported commands. It returns a bitmask of supported commands. MEMBARRIER_CMD_SHARED Execute a memory barrier on all threads running on the system. Upon return from system call, the caller thread is ensured that all running threads have passed through a state where all memory accesses to user-space addresses match program order between entry to and return from the system call (non-running threads are de facto in such a state). This covers threads from all pro=E2=80=90 cesses running on the system. This command returns 0. The flags argument needs to be 0. For future extensions. All memory accesses performed in program order from each targeted thread is guaranteed to be ordered with respect to sys_membarrier(). If we use the semantic "barrier()" to represent a compiler barrier forcing memory accesses to be performed in program order across the barrier, and smp_mb() to represent explicit memory barriers forcing full memory ordering across the barrier, we have the following ordering table for each pair of barrier(), sys_membarrier() and smp_mb(): The pair ordering is detailed as (O: ordered, X: not ordered): barrier() smp_mb() sys_membarrier() barrier() X X O smp_mb() X O O sys_membarrier() O O O RETURN VALUE On success, these system calls return zero. On error, -1 is returned, and errno is set appropriately. For a given command, with flags argument set to 0, this system call is guaranteed to always return the same value until reboot. ERRORS ENOSYS System call is not implemented. EINVAL Invalid arguments. Linux 2015-04-15 MEMBARRIER(2) Signed-off-by: Mathieu Desnoyers Reviewed-by: Paul E. McKenney Reviewed-by: Josh Triplett Cc: KOSAKI Motohiro Cc: Steven Rostedt Cc: Nicholas Miell Cc: Ingo Molnar Cc: Alan Cox Cc: Lai Jiangshan Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Pranith Kumar Cc: Michael Kerrisk Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/Kbuild | 1 + include/uapi/linux/membarrier.h | 53 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 include/uapi/linux/membarrier.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 70ff1d9abf0d..f7b2db44eb4b 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -252,6 +252,7 @@ header-y += mdio.h header-y += media.h header-y += media-bus-format.h header-y += mei.h +header-y += membarrier.h header-y += memfd.h header-y += mempolicy.h header-y += meye.h diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h new file mode 100644 index 000000000000..e0b108bd2624 --- /dev/null +++ b/include/uapi/linux/membarrier.h @@ -0,0 +1,53 @@ +#ifndef _UAPI_LINUX_MEMBARRIER_H +#define _UAPI_LINUX_MEMBARRIER_H + +/* + * linux/membarrier.h + * + * membarrier system call API + * + * Copyright (c) 2010, 2015 Mathieu Desnoyers + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * enum membarrier_cmd - membarrier system call command + * @MEMBARRIER_CMD_QUERY: Query the set of supported commands. It returns + * a bitmask of valid commands. + * @MEMBARRIER_CMD_SHARED: Execute a memory barrier on all running threads. + * Upon return from system call, the caller thread + * is ensured that all running threads have passed + * through a state where all memory accesses to + * user-space addresses match program order between + * entry to and return from the system call + * (non-running threads are de facto in such a + * state). This covers threads from all processes + * running on the system. This command returns 0. + * + * Command to be passed to the membarrier system call. The commands need to + * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to + * the value 0. + */ +enum membarrier_cmd { + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_SHARED = (1 << 0), +}; + +#endif /* _UAPI_LINUX_MEMBARRIER_H */ -- cgit v1.2.3