From 3ca0ff571b092ee4d807f1168caa428d95b0173b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Aug 2016 13:36:04 +0200 Subject: locking/mutex: Rework mutex::owner The current mutex implementation has an atomic lock word and a non-atomic owner field. This disparity leads to a number of issues with the current mutex code as it means that we can have a locked mutex without an explicit owner (because the owner field has not been set, or already cleared). This leads to a number of weird corner cases, esp. between the optimistic spinning and debug code. Where the optimistic spinning code needs the owner field updated inside the lock region, the debug code is more relaxed because the whole lock is serialized by the wait_lock. Also, the spinning code itself has a few corner cases where we need to deal with a held lock without an owner field. Furthermore, it becomes even more of a problem when trying to fix starvation cases in the current code. We end up stacking special case on special case. To solve this rework the basic mutex implementation to be a single atomic word that contains the owner and uses the low bits for extra state. This matches how PI futexes and rt_mutex already work. By having the owner an integral part of the lock state a lot of the problems dissapear and we get a better option to deal with starvation cases, direct owner handoff. Changing the basic mutex does however invalidate all the arch specific mutex code; this patch leaves that unused in-place, a later patch will remove that. Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Will Deacon Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/mutex-debug.h | 24 ----------------------- include/linux/mutex.h | 46 +++++++++++++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 40 deletions(-) delete mode 100644 include/linux/mutex-debug.h (limited to 'include/linux') diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h deleted file mode 100644 index 4ac8b1977b73..000000000000 --- a/include/linux/mutex-debug.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef __LINUX_MUTEX_DEBUG_H -#define __LINUX_MUTEX_DEBUG_H - -#include -#include -#include - -/* - * Mutexes - debugging helpers: - */ - -#define __DEBUG_MUTEX_INITIALIZER(lockname) \ - , .magic = &lockname - -#define mutex_init(mutex) \ -do { \ - static struct lock_class_key __key; \ - \ - __mutex_init((mutex), #mutex, &__key); \ -} while (0) - -extern void mutex_destroy(struct mutex *lock); - -#endif diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2cb7531e7d7a..4d3bccabbea5 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -18,6 +18,7 @@ #include #include #include +#include /* * Simple, straightforward mutexes with strict semantics: @@ -48,16 +49,12 @@ * locks and tasks (and only those tasks) */ struct mutex { - /* 1: unlocked, 0: locked, negative: locked, possible waiters */ - atomic_t count; + atomic_long_t owner; spinlock_t wait_lock; - struct list_head wait_list; -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER) - struct task_struct *owner; -#endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif + struct list_head wait_list; #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif @@ -66,6 +63,11 @@ struct mutex { #endif }; +static inline struct task_struct *__mutex_owner(struct mutex *lock) +{ + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x03); +} + /* * This is the control structure for tasks blocked on mutex, * which resides on the blocked task's kernel stack: @@ -79,9 +81,20 @@ struct mutex_waiter { }; #ifdef CONFIG_DEBUG_MUTEXES -# include + +#define __DEBUG_MUTEX_INITIALIZER(lockname) \ + , .magic = &lockname + +extern void mutex_destroy(struct mutex *lock); + #else + # define __DEBUG_MUTEX_INITIALIZER(lockname) + +static inline void mutex_destroy(struct mutex *lock) {} + +#endif + /** * mutex_init - initialize the mutex * @mutex: the mutex to be initialized @@ -90,14 +103,12 @@ struct mutex_waiter { * * It is not allowed to initialize an already locked mutex. */ -# define mutex_init(mutex) \ -do { \ - static struct lock_class_key __key; \ - \ - __mutex_init((mutex), #mutex, &__key); \ +#define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + __mutex_init((mutex), #mutex, &__key); \ } while (0) -static inline void mutex_destroy(struct mutex *lock) {} -#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ @@ -107,7 +118,7 @@ static inline void mutex_destroy(struct mutex *lock) {} #endif #define __MUTEX_INITIALIZER(lockname) \ - { .count = ATOMIC_INIT(1) \ + { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ __DEBUG_MUTEX_INITIALIZER(lockname) \ @@ -127,7 +138,10 @@ extern void __mutex_init(struct mutex *lock, const char *name, */ static inline int mutex_is_locked(struct mutex *lock) { - return atomic_read(&lock->count) != 1; + /* + * XXX think about spin_is_locked + */ + return __mutex_owner(lock) != NULL; } /* -- cgit v1.2.3 From 0f5225b024d4bffd682aab008c35862e8fdc1865 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Oct 2016 17:43:51 +0200 Subject: locking/mutex, drm: Introduce mutex_trylock_recursive() By popular DRM demand, introduce mutex_trylock_recursive() to fix up the two GEM users. Without this it is very easy for these drivers to get stuck in low-memory situations and trigger OOM. Work is in progress to remove the need for this in at least i915. Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Wilson Cc: Daniel Vetter Cc: David Airlie Cc: Davidlohr Bueso Cc: Ding Tianhong Cc: Imre Deak Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rob Clark Cc: Terry Rudd Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: dri-devel@lists.freedesktop.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 4d3bccabbea5..6a902f0a2148 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -189,4 +189,35 @@ extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); +/* + * These values are chosen such that FAIL and SUCCESS match the + * values of the regular mutex_trylock(). + */ +enum mutex_trylock_recursive_enum { + MUTEX_TRYLOCK_FAILED = 0, + MUTEX_TRYLOCK_SUCCESS = 1, + MUTEX_TRYLOCK_RECURSIVE, +}; + +/** + * mutex_trylock_recursive - trylock variant that allows recursive locking + * @lock: mutex to be locked + * + * This function should not be used, _ever_. It is purely for hysterical GEM + * raisins, and once those are gone this will be removed. + * + * Returns: + * MUTEX_TRYLOCK_FAILED - trylock failed, + * MUTEX_TRYLOCK_SUCCESS - lock acquired, + * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock. + */ +static inline __deprecated __must_check enum mutex_trylock_recursive_enum +mutex_trylock_recursive(struct mutex *lock) +{ + if (unlikely(__mutex_owner(lock) == current)) + return MUTEX_TRYLOCK_RECURSIVE; + + return mutex_trylock(lock); +} + #endif /* __LINUX_MUTEX_H */ -- cgit v1.2.3 From 43496d35513b25ad468bef91e51a39d61a0d8464 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 16 Nov 2016 10:36:05 +0100 Subject: locking/mutex: Don't mark mutex_trylock_recursive() as deprecated, temporarily MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until the DRM drivers are fixed to not use mutex_trylock_recursive(), allyes/modconfig builds will emit an API deprecation warning: drivers/gpu/drm/i915/i915_gem_shrinker.c: In function ‘i915_gem_shrinker_lock’: drivers/gpu/drm/i915/i915_gem_shrinker.c:230:2: warning: ‘mutex_trylock_recursive’ is deprecated [-Wdeprecated-declarations] switch (mutex_trylock_recursive(&dev->struct_mutex)) { ^ Don't pollute the kernel log until the DRM code is fixed. Hopefully the checkpatch warning is enough to keep people from using this new API, and we'll be NAK-ing new users as well. Cc: Chris Wilson Cc: Daniel Vetter Cc: David Airlie Cc: Davidlohr Bueso Cc: Ding Tianhong Cc: Imre Deak Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rob Clark Cc: Terry Rudd Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: dri-devel@lists.freedesktop.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 6a902f0a2148..b97870f2debd 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -211,7 +211,7 @@ enum mutex_trylock_recursive_enum { * MUTEX_TRYLOCK_SUCCESS - lock acquired, * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock. */ -static inline __deprecated __must_check enum mutex_trylock_recursive_enum +static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum mutex_trylock_recursive(struct mutex *lock) { if (unlikely(__mutex_owner(lock) == current)) -- cgit v1.2.3 From 6d0d287891a022ebba572327cbd70b5de69a63a2 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 16 Nov 2016 13:23:05 +0100 Subject: locking/core: Provide common cpu_relax_yield() definition No need to duplicate the same define everywhere. Since the only user is stop-machine and the only provider is s390, we can use a default implementation of cpu_relax_yield() in sched.h. Suggested-by: Russell King Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Acked-by: Russell King Cc: Andrew Morton Cc: Catalin Marinas Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Nicholas Piggin Cc: Noam Camus Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-s390 Cc: linuxppc-dev@lists.ozlabs.org Cc: sparclinux@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1479298985-191589-1-git-send-email-borntraeger@de.ibm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..c1aa3b02f6ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2444,6 +2444,10 @@ static inline void calc_load_enter_idle(void) { } static inline void calc_load_exit_idle(void) { } #endif /* CONFIG_NO_HZ_COMMON */ +#ifndef cpu_relax_yield +#define cpu_relax_yield() cpu_relax() +#endif + /* * Do not use outside of architecture code which knows its limitations. * -- cgit v1.2.3 From 194a6b5b9cb6b91a5f7d86984165a3bc55188599 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 17 Nov 2016 11:46:38 -0500 Subject: sched/wake_q: Rename WAKE_Q to DEFINE_WAKE_Q Currently the wake_q data structure is defined by the WAKE_Q() macro. This macro, however, looks like a function doing something as "wake" is a verb. Even checkpatch.pl was confused as it reported warnings like WARNING: Missing a blank line after declarations #548: FILE: kernel/futex.c:3665: + int ret; + WAKE_Q(wake_q); This patch renames the WAKE_Q() macro to DEFINE_WAKE_Q() which clarifies what the macro is doing and eliminates the checkpatch.pl warnings. Signed-off-by: Waiman Long Acked-by: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1479401198-1765-1-git-send-email-longman@redhat.com [ Resolved conflict and added missing rename. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c1aa3b02f6ac..dc37cbe2b13c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -989,7 +989,7 @@ enum cpu_idle_type { * already in a wake queue, the wakeup will happen soon and the second * waker can just skip it. * - * The WAKE_Q macro declares and initializes the list head. + * The DEFINE_WAKE_Q macro declares and initializes the list head. * wake_up_q() does NOT reinitialize the list; it's expected to be * called near the end of a function, where the fact that the queue is * not used again will be easy to see by inspection. @@ -1009,7 +1009,7 @@ struct wake_q_head { #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) -#define WAKE_Q(name) \ +#define DEFINE_WAKE_Q(name) \ struct wake_q_head name = { WAKE_Q_TAIL, &name.first } extern void wake_q_add(struct wake_q_head *head, -- cgit v1.2.3 From d9345c65eb7930ac6755cf593ee7686f4029ccf4 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Wed, 2 Nov 2016 05:08:28 -0400 Subject: sched/core: Introduce the vcpu_is_preempted(cpu) interface This patch is the first step to add support to improve lock holder preemption beaviour. vcpu_is_preempted(cpu) does the obvious thing: it tells us whether a vCPU is preempted or not. Defaults to false on architectures that don't support it. Suggested-by: Peter Zijlstra (Intel) Tested-by: Juergen Gross Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) [ Translated the changelog to English. ] Acked-by: Christian Borntraeger Acked-by: Paolo Bonzini Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: boqun.feng@gmail.com Cc: bsingharora@gmail.com Cc: dave@stgolabs.net Cc: kernellwp@gmail.com Cc: konrad.wilk@oracle.com Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: rkrcmar@redhat.com Cc: virtualization@lists.linux-foundation.org Cc: will.deacon@arm.com Cc: xen-devel-request@lists.xenproject.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1478077718-37424-2-git-send-email-xinhui.pan@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index dc37cbe2b13c..37261afbf16a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3510,6 +3510,18 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +/* + * In order to reduce various lock holder preemption latencies provide an + * interface to see if a vCPU is currently running or not. + * + * This allows us to terminate optimistic spin loops and block, analogous to + * the native optimistic spin heuristic of testing if the lock owner task is + * running or not. + */ +#ifndef vcpu_is_preempted +# define vcpu_is_preempted(cpu) false +#endif + extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); -- cgit v1.2.3 From 4ec6e863625625a54f527464ab91ce1a1cb16c42 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Wed, 2 Nov 2016 05:08:34 -0400 Subject: kvm: Introduce kvm_write_guest_offset_cached() It allows us to update some status or field of a structure partially. We can also save a kvm_read_guest_cached() call if we just update one fild of the struct regardless of its current value. Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: boqun.feng@gmail.com Cc: borntraeger@de.ibm.com Cc: bsingharora@gmail.com Cc: dave@stgolabs.net Cc: jgross@suse.com Cc: kernellwp@gmail.com Cc: konrad.wilk@oracle.com Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: rkrcmar@redhat.com Cc: virtualization@lists.linux-foundation.org Cc: will.deacon@arm.com Cc: xen-devel-request@lists.xenproject.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1478077718-37424-8-git-send-email-xinhui.pan@linux.vnet.ibm.com [ Typo fixes. ] Signed-off-by: Ingo Molnar --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 01c0b9cc3915..6f0023797b33 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -645,6 +645,8 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); +int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); -- cgit v1.2.3 From f4ec57b632fe15ed7a355099cb96ed4b647416de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Nov 2016 12:46:26 +0100 Subject: locking/ww_mutex: Use relaxed atomics The stamp is a sequence number, we don't care about memory ordering. Signed-off-by: Peter Zijlstra (Intel) Cc: Boqun Feng Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/ww_mutex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 2bb5deb0012e..7b0066814fa0 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -120,7 +120,7 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, struct ww_class *ww_class) { ctx->task = current; - ctx->stamp = atomic_long_inc_return(&ww_class->stamp); + ctx->stamp = atomic_long_inc_return_relaxed(&ww_class->stamp); ctx->acquired = 0; #ifdef CONFIG_DEBUG_MUTEXES ctx->ww_class = ww_class; -- cgit v1.2.3