Diffstat (limited to 'include/linux/mmap_lock.h')
-rw-r--r--  include/linux/mmap_lock.h | 279
1 file changed, 240 insertions(+), 39 deletions(-)
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index d53f72dba7fe..93eca48bc443 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -78,6 +78,43 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
#ifdef CONFIG_PER_VMA_LOCK
+#ifdef CONFIG_LOCKDEP
+#define __vma_lockdep_map(vma) (&vma->vmlock_dep_map)
+#else
+#define __vma_lockdep_map(vma) NULL
+#endif
+
+/*
+ * VMA locks do not behave like most ordinary locks found in the kernel, so we
+ * cannot quite have full lockdep tracking in the way we would ideally prefer.
+ *
+ * Read locks act as shared locks which exclude an exclusive lock from being
+ * taken. We therefore mark these accordingly on read lock acquire/release.
+ *
+ * Write locks are acquired exclusively per-VMA, but released in a shared
+ * fashion: upon vma_end_write_all() we update the mm's seqcount such that
+ * every VMA write lock is released at once.
+ *
+ * We therefore cannot track write locks per-VMA, nor do we try. Mitigating this
+ * is the fact that, of course, we do lockdep-track the mmap lock rwsem which
+ * must be held when taking a VMA write lock.
+ *
+ * We do, however, want to indicate that, during either acquisition of a VMA
+ * write lock or detachment of a VMA, the lock held must be exclusive, so we
+ * utilise lockdep to do so.
+ */
+#define __vma_lockdep_acquire_read(vma) \
+ lock_acquire_shared(__vma_lockdep_map(vma), 0, 1, NULL, _RET_IP_)
+#define __vma_lockdep_release_read(vma) \
+ lock_release(__vma_lockdep_map(vma), _RET_IP_)
+#define __vma_lockdep_acquire_exclusive(vma) \
+ lock_acquire_exclusive(__vma_lockdep_map(vma), 0, 0, NULL, _RET_IP_)
+#define __vma_lockdep_release_exclusive(vma) \
+ lock_release(__vma_lockdep_map(vma), _RET_IP_)
+/* Only meaningful if CONFIG_LOCK_STAT is defined. */
+#define __vma_lockdep_stat_mark_acquired(vma) \
+ lock_acquired(__vma_lockdep_map(vma), _RET_IP_)
+
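[Editor's note: the "exclusive acquire, shared release" asymmetry above is the crux of why per-VMA write locks cannot be fully lockdep-tracked. Below is a minimal userspace C model of the seqcount scheme, under simplified stand-in structs; only the mm_lock_seq/vm_lock_seq fields and vma_end_write_all() mirror the kernel.]

	#include <stdbool.h>
	#include <stdio.h>

	/* Simplified stand-ins for mm_struct/vm_area_struct (illustrative only). */
	struct mm  { unsigned int mm_lock_seq; };
	struct vma { struct mm *mm; unsigned int vm_lock_seq; };

	/* Write-locking a VMA stamps it with the mm's current sequence number. */
	static void vma_write_lock(struct vma *v)   { v->vm_lock_seq = v->mm->mm_lock_seq; }

	/* One bump of mm_lock_seq releases every write-locked VMA at once. */
	static void vma_end_write_all(struct mm *m) { m->mm_lock_seq++; }

	static bool vma_write_locked(const struct vma *v)
	{
		return v->vm_lock_seq == v->mm->mm_lock_seq;
	}

	int main(void)
	{
		struct mm mm = { .mm_lock_seq = 0 };
		struct vma a = { .mm = &mm, .vm_lock_seq = ~0u };
		struct vma b = { .mm = &mm, .vm_lock_seq = ~0u };

		vma_write_lock(&a);                 /* exclusive, per-VMA acquire */
		vma_write_lock(&b);
		printf("%d %d\n", vma_write_locked(&a), vma_write_locked(&b)); /* 1 1 */

		vma_end_write_all(&mm);             /* shared release, no per-VMA unlock */
		printf("%d %d\n", vma_write_locked(&a), vma_write_locked(&b)); /* 0 0 */
		return 0;
	}

Because no per-VMA release ever happens, lockdep has no release to pair with a per-VMA exclusive acquire, which is why the write-side annotations above are only held transiently.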
static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
seqcount_init(&mm->mm_lock_seq);
@@ -115,36 +152,81 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key lockdep_key;
- lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
+ lockdep_init_map(__vma_lockdep_map(vma), "vm_lock", &lockdep_key, 0);
#endif
if (reset_refcnt)
refcount_set(&vma->vm_refcnt, 0);
vma->vm_lock_seq = UINT_MAX;
}
-static inline bool is_vma_writer_only(int refcnt)
+/*
+ * This function determines whether the input VMA reference count describes a
+ * VMA which has excluded all VMA read locks.
+ *
+ * In the case of a detached VMA, we may incorrectly indicate that readers are
+ * excluded when one remains, because in that scenario we target a refcount of
+ * VM_REFCNT_EXCLUDE_READERS_FLAG, rather than the attached target of
+ * VM_REFCNT_EXCLUDE_READERS_FLAG + 1.
+ *
+ * However, the race window for this is very small, so it is unlikely.
+ *
+ * Returns: true if readers are excluded, false otherwise.
+ */
+static inline bool __vma_are_readers_excluded(int refcnt)
{
/*
- * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
- * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
- * a detached vma happens only in vma_mark_detached() and is a rare
- * case, therefore most of the time there will be no unnecessary wakeup.
+ * See the comment describing the vm_area_struct->vm_refcnt field for
+ * details of possible refcnt values.
*/
- return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1;
+ return (refcnt & VM_REFCNT_EXCLUDE_READERS_FLAG) &&
+ refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1;
+}
+
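[Editor's note: as a sanity check of this predicate, a standalone sketch follows; the flag value is an assumption for illustration (in the kernel it is a single high bit, formerly VMA_LOCK_OFFSET).]

	#include <stdbool.h>
	#include <stdio.h>

	/* Assumed flag value; the header only requires a single high bit. */
	#define EXCLUDE_READERS_FLAG 0x40000000

	static bool readers_excluded(int refcnt)
	{
		return (refcnt & EXCLUDE_READERS_FLAG) &&
		       refcnt <= EXCLUDE_READERS_FLAG + 1;
	}

	int main(void)
	{
		printf("%d\n", readers_excluded(EXCLUDE_READERS_FLAG));     /* 1: detached target */
		printf("%d\n", readers_excluded(EXCLUDE_READERS_FLAG + 1)); /* 1: attached target */
		printf("%d\n", readers_excluded(EXCLUDE_READERS_FLAG + 2)); /* 0: a reader remains */
		printf("%d\n", readers_excluded(1));                        /* 0: flag not set */
		return 0;
	}

Note the false positive the comment warns about: a detaching VMA targets a refcount of the bare flag, so a value of flag + 1 reports readers excluded even though one reader remains.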
+/*
+ * Actually decrement the VMA reference count.
+ *
+ * The function returns the reference count as it was immediately after the
+ * decrement took place. If it returns zero, the VMA is now detached.
+ */
+static inline __must_check unsigned int
+__vma_refcount_put_return(struct vm_area_struct *vma)
+{
+ int oldcnt;
+
+ if (__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt))
+ return 0;
+
+ return oldcnt - 1;
}
+/**
+ * vma_refcount_put() - Drop a reference in the VMA vm_refcnt field due to a
+ * read lock being dropped.
+ * @vma: The VMA whose reference count we wish to decrement.
+ *
+ * If we were the last reader, wake up threads waiting to obtain an exclusive
+ * lock.
+ */
static inline void vma_refcount_put(struct vm_area_struct *vma)
{
- /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
+ /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt. */
struct mm_struct *mm = vma->vm_mm;
- int oldcnt;
+ int newcnt;
- rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
- if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
+ __vma_lockdep_release_read(vma);
+ newcnt = __vma_refcount_put_return(vma);
- if (is_vma_writer_only(oldcnt - 1))
- rcuwait_wake_up(&mm->vma_writer_wait);
- }
+ /*
+ * __vma_start_exclude_readers() may be sleeping, waiting for readers to
+ * drop their reference count, so wake it up if we were the last reader
+ * blocking the exclusive lock from being acquired.
+ *
+ * We may be raced by other readers temporarily incrementing the
+ * reference count; though the race window is very small, this might
+ * cause spurious wakeups.
+ */
+ if (newcnt && __vma_are_readers_excluded(newcnt))
+ rcuwait_wake_up(&mm->vma_writer_wait);
}
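[Editor's note: putting the two helpers together, a hedged single-threaded sketch of the unlock-and-maybe-wake decision; C11 atomics stand in for the kernel's refcount_t, and the flag value is assumed.]

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define EXCLUDE_READERS_FLAG 0x40000000u  /* assumed flag value */

	static bool readers_excluded(unsigned int refcnt)
	{
		return (refcnt & EXCLUDE_READERS_FLAG) &&
		       refcnt <= EXCLUDE_READERS_FLAG + 1;
	}

	/* Returns the refcount as it stands after our decrement. */
	static unsigned int refcount_put_return(atomic_uint *refcnt)
	{
		return atomic_fetch_sub(refcnt, 1) - 1;
	}

	int main(void)
	{
		/* A writer excluding readers (flag + 1, attached) plus one lingering reader. */
		atomic_uint refcnt = EXCLUDE_READERS_FLAG + 2;
		unsigned int newcnt = refcount_put_return(&refcnt);

		/* Last reader gone: this is where the waiting writer would be woken. */
		if (newcnt && readers_excluded(newcnt))
			printf("wake writer (refcnt now %#x)\n", newcnt);
		return 0;
	}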
/*
@@ -159,10 +241,10 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int
mmap_assert_locked(vma->vm_mm);
if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
- VMA_REF_LIMIT)))
+ VM_REFCNT_LIMIT)))
return false;
- rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+ __vma_lockdep_acquire_read(vma);
return true;
}
@@ -182,21 +264,31 @@ static inline void vma_end_read(struct vm_area_struct *vma)
vma_refcount_put(vma);
}
-/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
+static inline unsigned int __vma_raw_mm_seqnum(struct vm_area_struct *vma)
{
+ const struct mm_struct *mm = vma->vm_mm;
+
+ /* We must hold an exclusive write lock for this access to be valid. */
mmap_assert_write_locked(vma->vm_mm);
+ return mm->mm_lock_seq.sequence;
+}
+/*
+ * Determine whether a VMA is write-locked. Must be invoked ONLY if the mmap
+ * write lock is held.
+ *
+ * Returns true if write-locked, otherwise false.
+ */
+static inline bool __is_vma_write_locked(struct vm_area_struct *vma)
+{
/*
 * The current task is holding mmap_write_lock, so both vma->vm_lock_seq and
 * mm->mm_lock_seq can't be concurrently modified.
*/
- *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
- return (vma->vm_lock_seq == *mm_lock_seq);
+ return vma->vm_lock_seq == __vma_raw_mm_seqnum(vma);
}
-int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
- int state);
+int __vma_start_write(struct vm_area_struct *vma, int state);
/*
* Begin writing to a VMA.
@@ -205,12 +297,10 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
*/
static inline void vma_start_write(struct vm_area_struct *vma)
{
- unsigned int mm_lock_seq;
-
- if (__is_vma_write_locked(vma, &mm_lock_seq))
+ if (__is_vma_write_locked(vma))
return;
- __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE);
+ __vma_start_write(vma, TASK_UNINTERRUPTIBLE);
}
/**
@@ -229,26 +319,110 @@ static inline void vma_start_write(struct vm_area_struct *vma)
static inline __must_check
int vma_start_write_killable(struct vm_area_struct *vma)
{
- unsigned int mm_lock_seq;
-
- if (__is_vma_write_locked(vma, &mm_lock_seq))
+ if (__is_vma_write_locked(vma))
return 0;
- return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE);
+
+ return __vma_start_write(vma, TASK_KILLABLE);
}
+/**
+ * vma_assert_write_locked() - assert that a VMA write lock is held on @vma.
+ * @vma: The VMA to assert.
+ */
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
- unsigned int mm_lock_seq;
-
- VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+ VM_WARN_ON_ONCE_VMA(!__is_vma_write_locked(vma), vma);
}
+/**
+ * vma_assert_locked() - assert that either a VMA read or a VMA write lock is
+ * held on @vma and that it is not detached.
+ * @vma: The VMA to assert.
+ */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
- unsigned int mm_lock_seq;
+ unsigned int refcnt;
+
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ if (!lock_is_held(__vma_lockdep_map(vma)))
+ vma_assert_write_locked(vma);
+ return;
+ }
+
+ /*
+ * See the comment describing the vm_area_struct->vm_refcnt field for
+ * details of possible refcnt values.
+ */
+ refcnt = refcount_read(&vma->vm_refcnt);
+
+ /*
+ * In this case we're either read-locked, write-locked with temporary
+ * readers, or in the midst of excluding readers, all of which means
+ * we're locked.
+ */
+ if (refcnt > 1)
+ return;
+
+ /* It is a bug for the VMA to be detached here. */
+ VM_WARN_ON_ONCE_VMA(!refcnt, vma);
+
+ /*
+ * OK, the VMA has a reference count of 1 which means it is either
+ * unlocked and attached or write-locked, so assert that it is
+ * write-locked.
+ */
+ vma_assert_write_locked(vma);
+}
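[Editor's note: the refcount case analysis above can be summarised as a classifier; this is editorial shorthand with an assumed flag value, not kernel code.]

	#include <stdio.h>

	#define EXCLUDE_READERS_FLAG 0x40000000u  /* assumed */

	static const char *refcnt_state(unsigned int refcnt)
	{
		if (refcnt == 0)
			return "detached: asserting locked here is a bug";
		if (refcnt == 1)
			return "attached: unlocked, or write-locked with no readers";
		if (refcnt & EXCLUDE_READERS_FLAG)
			return "locked: a writer is excluding readers";
		return "locked: read-locked, or transient reader references";
	}

	int main(void)
	{
		unsigned int samples[] = { 0, 1, 2, EXCLUDE_READERS_FLAG + 1 };
		for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
			printf("refcnt %#x -> %s\n", samples[i], refcnt_state(samples[i]));
		return 0;
	}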
+
+/**
+ * vma_assert_stabilised() - assert that @vma cannot be changed from underneath
+ * us because either a VMA or an mmap lock is held.
+ * @vma: The VMA whose stability we wish to assess.
+ *
+ * If lockdep is enabled we can precisely ensure stability via either an mmap
+ * lock owned by us or a specific VMA lock.
+ *
+ * With lockdep disabled we may sometimes race with other threads acquiring the
+ * mmap read lock simultaneously with our VMA read lock.
+ */
+static inline void vma_assert_stabilised(struct vm_area_struct *vma)
+{
+ /*
+ * If another thread owns an mmap lock, it may go away at any time, and
+ * thus provides no guarantee of stability.
+ *
+ * If lockdep is enabled we can accurately determine whether an mmap lock
+ * is held and owned by us. Otherwise we must approximate.
+ *
+ * This doesn't necessarily mean we are not stabilised, however, as we may
+ * hold a VMA read lock (not a write lock, as this would require an owned
+ * mmap lock).
+ *
+ * If (assuming lockdep is not enabled) we were to assert a VMA read
+ * lock first, we may also run into issues, as other threads can hold VMA
+ * read locks simultaneously with us.
+ *
+ * Therefore, if lockdep is not enabled, we risk a false negative (i.e. no
+ * assert fired). If accurate checking is required, enable lockdep.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ if (lockdep_is_held(&vma->vm_mm->mmap_lock))
+ return;
+ } else {
+ if (rwsem_is_locked(&vma->vm_mm->mmap_lock))
+ return;
+ }
- VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
- !__is_vma_write_locked(vma, &mm_lock_seq), vma);
+ /*
+ * We're not stabilised by the mmap lock, so assert that we're
+ * stabilised by a VMA lock.
+ */
+ vma_assert_locked(vma);
+}
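[Editor's note: condensed into a decision table, with booleans abstracting lockdep_is_held(), rwsem_is_locked() and the VMA-lock check; illustrative only.]

	#include <stdbool.h>
	#include <stdio.h>

	/* Whether the stability check passes, per config and lock state. */
	static bool stability_check_passes(bool lockdep, bool mmap_held_by_us,
					   bool mmap_held_by_anyone, bool vma_locked)
	{
		if (lockdep)
			return mmap_held_by_us || vma_locked;   /* precise */
		return mmap_held_by_anyone || vma_locked;       /* approximate */
	}

	int main(void)
	{
		/* The false negative: without lockdep, another thread's mmap read
		 * lock satisfies the check even though we hold nothing ourselves. */
		printf("lockdep:  %d\n", stability_check_passes(true,  false, true, false)); /* 0 */
		printf("!lockdep: %d\n", stability_check_passes(false, false, true, false)); /* 1 */
		return 0;
	}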
+
+static inline bool vma_is_attached(struct vm_area_struct *vma)
+{
+ return refcount_read(&vma->vm_refcnt);
}
/*
@@ -258,12 +432,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma)
*/
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
- WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
+ WARN_ON_ONCE(!vma_is_attached(vma));
}
static inline void vma_assert_detached(struct vm_area_struct *vma)
{
- WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
+ WARN_ON_ONCE(vma_is_attached(vma));
}
static inline void vma_mark_attached(struct vm_area_struct *vma)
@@ -273,7 +447,28 @@ static inline void vma_mark_attached(struct vm_area_struct *vma)
refcount_set_release(&vma->vm_refcnt, 1);
}
-void vma_mark_detached(struct vm_area_struct *vma);
+void __vma_exclude_readers_for_detach(struct vm_area_struct *vma);
+
+static inline void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+
+ /*
+ * The VMA still being attached (refcnt > 0) is unlikely, because the vma
+ * has already been write-locked and readers can increment vm_refcnt
+ * only temporarily before they check vm_lock_seq, realize the vma is
+ * locked and drop back the vm_refcnt. That is a narrow window for
+ * observing a raised vm_refcnt.
+ *
+ * See the comment describing the vm_area_struct->vm_refcnt field for
+ * details of possible refcnt values.
+ */
+ if (likely(!__vma_refcount_put_return(vma)))
+ return;
+
+ __vma_exclude_readers_for_detach(vma);
+}
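[Editor's note: a sketch of the fast/slow split above, as a userspace model; the slow path stands in for __vma_exclude_readers_for_detach(), which in the kernel sleeps on an rcuwait rather than spinning.]

	#include <stdatomic.h>
	#include <stdio.h>

	#define EXCLUDE_READERS_FLAG 0x40000000u  /* assumed */

	static void mark_detached(atomic_uint *refcnt)
	{
		/* Drop the attach reference; zero means no readers raced us. */
		if (atomic_fetch_sub(refcnt, 1) - 1 == 0)
			return;                          /* fast path: detached */

		/* Slow path stand-in: raise the exclusion flag, then wait until
		 * the transient readers drop their references. */
		atomic_fetch_add(refcnt, EXCLUDE_READERS_FLAG);
		while (atomic_load(refcnt) != EXCLUDE_READERS_FLAG)
			;                                /* kernel sleeps here instead */
		atomic_store(refcnt, 0);                 /* now fully detached */
	}

	int main(void)
	{
		atomic_uint refcnt = 1;                  /* attached, write-locked, no readers */
		mark_detached(&refcnt);
		printf("refcnt = %u\n", atomic_load(&refcnt)); /* 0 */
		return 0;
	}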
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
unsigned long address);
@@ -327,6 +522,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma)
mmap_assert_locked(vma->vm_mm);
}
+static inline void vma_assert_stabilised(struct vm_area_struct *vma)
+{
+	/* Without VMA locks, holding either the mmap read or write lock suffices to stabilise. */
+ mmap_assert_locked(vma->vm_mm);
+}
+
#endif /* CONFIG_PER_VMA_LOCK */
static inline void mmap_write_lock(struct mm_struct *mm)