From e067eba5871c6922539dc1728699c14e6b22590f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:42:06 -0800 Subject: userfaultfd: document _IOR/_IOW Patch series "userfaultfd tmpfs/hugetlbfs/non-cooperative", v2 These userfaultfd features are finished and are ready for larger exposure in -mm and upstream merging. 1) tmpfs non present userfault 2) hugetlbfs non present userfault 3) non cooperative userfault for fork/madvise/mremap qemu development code is already exercising 2) and container postcopy live migration needs 3). 1) is not currently used but there's a self test and we know some qemu user for various reasons uses tmpfs as backing for KVM so it'll need it too to use postcopy live migration with tmpfs memory. All review feedback from the previous submit has been handled and the fixes are included. There's no outstanding issue AFAIK. Upstream code just did a s/fe/vmf/ conversion in the page faults and this has been converted as well incrementally. In addition to the previous submits, this also wakes up stuck userfaults during UFFDIO_UNREGISTER. The non cooperative testcase actually reproduced this problem by getting stuck instead of quitting cleanly in some rare case as it could call UFFDIO_UNREGISTER while some userfault could be still in flight. The other option would have been to keep leaving it up to userland to serialize itself and to patch the testcase instead but the wakeup during unregister I think is preferable. David also asked for the UFFD_FEATURE_MISSING_HUGETLBFS and UFFD_FEATURE_MISSING_SHMEM feature flags to be added so QEMU can avoid probing whether the hugetlbfs/shmem missing support is available by calling UFFDIO_REGISTER. QEMU already checks HUGETLBFS_MAGIC with fstatfs so if UFFD_FEATURE_MISSING_HUGETLBFS is also set, it knows UFFDIO_REGISTER will succeed (or if it fails, it's for some other more concerning reason). There's no reason to worry about adding too many feature flags. 
There are 64 available and in the worst case we have to bump the API if someday we really run out of them. The round-trip network latency of hugetlbfs userfaults during postcopy live migration is still on the order of a dozen milliseconds on 10GBit if at 2MB hugepage granularity so it's working perfectly and it should provide for higher bandwidth or lower CPU usage (which makes it interesting to add an option in the future to support THP granularity too for anonymous memory, UFFDIO_COPY would then have to create THP if alignment/len allows for it). 1GB hugetlbfs granularity will require big changes in hugetlbfs to work so it's deferred for later. This patch (of 42): This adds proper documentation (inline) to avoid the risk of further misunderstandings about the semantics of _IOW/_IOR and it also reminds whoever will bump the UFFDIO_API in the future, to change the two ioctls to _IOW. This was found while implementing strace support for those ioctls, otherwise we could have never found it by just reviewing kernel code and testing it. _IOC_READ or _IOC_WRITE alters nothing but the ioctl number itself, so it's only worth fixing if the UFFDIO_API is bumped someday. Link: http://lkml.kernel.org/r/20161216144821.5183-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: "Dmitry V. Levin" Cc: Michael Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 9057d7af3ae1..94046b8aa6ad 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -11,6 +11,12 @@ #include +/* + * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and + * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. 
In + * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ + * means the userland is reading). + */ #define UFFD_API ((__u64)0xAA) /* * After implementing the respective features it will become: -- cgit v1.2.3 From 893e26e61d04eac974ded0c11e1647b335c8cb7b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:27 -0800 Subject: userfaultfd: non-cooperative: Add fork() event When the mm with uffd-ed vmas fork()-s the respective vmas notify their uffds with the event which contains a descriptor with new uffd. This new descriptor can then be used to get events from the child and populate its mm with data. Note, that there can be different uffd-s controlling different vmas within one mm, so first we should collect all those uffds (and ctx-s) in a list and then notify them all one by one but only once per fork(). The context is created at fork() time but the descriptor, file struct and anon inode object is created at event read time. So some trickery is added to the userfaultfd_ctx_read() to handle the ctx queues' locking vs file creation. Another thing worth noticing is that the task that fork()-s waits for the uffd event to get processed WITHOUT the mmap sem. [aarcange@redhat.com: build warning fix] Link: http://lkml.kernel.org/r/20161216144821.5183-10-aarcange@redhat.com Link: http://lkml.kernel.org/r/20161216144821.5183-9-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 94046b8aa6ad..c8953c84fdcc 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,12 +18,7 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -/* - * After implementing the respective features it will become: - * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ - * UFFD_FEATURE_EVENT_FORK) - */ -#define UFFD_API_FEATURES (0) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -77,6 +72,10 @@ struct uffd_msg { __u64 address; } pagefault; + struct { + __u32 ufd; + } fork; + struct { /* unused reserved fields */ __u64 reserved1; @@ -90,9 +89,7 @@ struct uffd_msg { * Start at 0x12 and not at 0 to be more strict against bugs. */ #define UFFD_EVENT_PAGEFAULT 0x12 -#if 0 /* not available yet */ #define UFFD_EVENT_FORK 0x13 -#endif /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -111,10 +108,8 @@ struct uffdio_api { * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. */ -#if 0 /* not available yet */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) -#endif __u64 features; __u64 ioctls; -- cgit v1.2.3 From 72f87654c69690ff4721bd9b4a39983f971de9a5 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:34 -0800 Subject: userfaultfd: non-cooperative: add mremap() event The event denotes that an area [start:end] moves to different location. 
Length change isn't reported as "new" addresses, if they appear on the uffd reader side they will not contain any data and the latter can just zeromap them. Waiting for the event ACK is also done outside of mmap sem, as for fork event. Link: http://lkml.kernel.org/r/20161216144821.5183-12-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index c8953c84fdcc..79a85e5bd388 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,7 +18,8 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -76,6 +77,12 @@ struct uffd_msg { __u32 ufd; } fork; + struct { + __u64 from; + __u64 to; + __u64 len; + } remap; + struct { /* unused reserved fields */ __u64 reserved1; @@ -90,6 +97,7 @@ struct uffd_msg { */ #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -110,6 +118,7 @@ struct uffdio_api { */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) __u64 features; __u64 ioctls; -- cgit v1.2.3 From 05ce77249d5068b057082d24ec22d3824f4816ac Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 22 Feb 2017 15:42:40 -0800 Subject: userfaultfd: non-cooperative: 
add madvise() event for MADV_DONTNEED request If the page is punched out of the address space the uffd reader should know this and zeromap the respective area in case of the #PF event. Link: http://lkml.kernel.org/r/20161216144821.5183-14-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 79a85e5bd388..2bbf32319cf5 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,7 +19,8 @@ */ #define UFFD_API ((__u64)0xAA) #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ - UFFD_FEATURE_EVENT_REMAP) + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_MADVDONTNEED) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -83,6 +84,11 @@ struct uffd_msg { __u64 len; } remap; + struct { + __u64 start; + __u64 end; + } madv_dn; + struct { /* unused reserved fields */ __u64 reserved1; @@ -98,6 +104,7 @@ struct uffd_msg { #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_MADVDONTNEED 0x15 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -119,6 +126,7 @@ struct uffdio_api { #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) __u64 features; __u64 ioctls; -- cgit v1.2.3 From cab350afcbc9c8a744e0d164d1c26560568f770b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 22 Feb 2017 15:43:04 -0800 Subject: userfaultfd: hugetlbfs: allow 
registration of ranges containing huge pages Expand the userfaultfd_register/unregister routines to allow VM_HUGETLB vmas. huge page alignment checking is performed after a VM_HUGETLB vma is encountered. Also, since there is no UFFDIO_ZEROPAGE support for huge pages do not return that as a valid ioctl method for huge page ranges. Link: http://lkml.kernel.org/r/20161216144821.5183-22-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 2bbf32319cf5..a3828a9bc16e 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -29,6 +29,9 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE) +#define UFFD_API_RANGE_IOCTLS_HPAGE \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY) /* * Valid ioctl command number range with this API is from 0x00 to -- cgit v1.2.3 From 163e11bc4f6ebbfcfdf751c108bd212a26e492ee Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:19 -0800 Subject: userfaultfd: hugetlbfs: UFFD_FEATURE_MISSING_HUGETLBFS Userland developers asked to be notified immediately by the UFFDIO_API ioctl if hugetlbfs missing mode is supported by userfaultfd in the running kernel. This avoids the need to run UFFDIO_REGISTER on a hugetlbfs virtual memory range to find out. Link: http://lkml.kernel.org/r/20161216144821.5183-27-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index a3828a9bc16e..7293321abdfb 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,9 +18,10 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ - UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_MADVDONTNEED) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_MADVDONTNEED | \ + UFFD_FEATURE_MISSING_HUGETLBFS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -125,11 +126,32 @@ struct uffdio_api { * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. + * + * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER + * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on + * hugetlbfs virtual memory ranges. Adding or not adding + * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has + * no real functional effect after UFFDIO_API returns, but + * it's only useful for an initial feature set probe at + * UFFDIO_API time. 
There are two ways to use it: + * + * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the + * uffdio_api.features before calling UFFDIO_API, an error + * will be returned by UFFDIO_API on a kernel without + * hugetlbfs missing support + * + * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in + * uffdio_api.features and instead it will be set by the + * kernel in the uffdio_api.features if the kernel supports + * it, so userland can later check if the feature flag is + * present in uffdio_api.features after UFFDIO_API + * succeeded. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) #define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) __u64 features; __u64 ioctls; -- cgit v1.2.3 From cac673292b9b39493bb0ff526b96c83ace6fdcd0 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 22 Feb 2017 15:43:40 -0800 Subject: userfaultfd: shmem: allow registration of shared memory ranges Expand the userfaultfd_register/unregister routines to allow shared memory VMAs. Currently, there is no UFFDIO_ZEROPAGE and write-protection support for shared memory VMAs, which is reflected in ioctl methods supported by uffdio_register. Link: http://lkml.kernel.org/r/20161216144821.5183-34-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 7293321abdfb..10631a4cdb24 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -30,7 +30,7 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE) -#define UFFD_API_RANGE_IOCTLS_HPAGE \ +#define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY) -- cgit v1.2.3 From 47dd924508f5fb10480afc69de04539fa3d14034 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Feb 2017 15:43:58 -0800 Subject: userfaultfd: hugetlbfs: UFFD_FEATURE_MISSING_SHMEM Userland developers asked to be notified immediately by the UFFDIO_API ioctl if shmem missing mode is supported by userfaultfd in the running kernel. This avoids the need to run UFFDIO_REGISTER on a shmem virtual memory range to find out. Link: http://lkml.kernel.org/r/20161216144821.5183-38-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 10631a4cdb24..9ac4b68c54d1 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -21,7 +21,8 @@ #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_MADVDONTNEED | \ - UFFD_FEATURE_MISSING_HUGETLBFS) + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -146,12 +147,17 @@ struct uffdio_api { * it, so userland can later check if the feature flag is * present in uffdio_api.features after UFFDIO_API * succeeded. + * + * UFFD_FEATURE_MISSING_SHMEM works the same as + * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem + * (i.e. tmpfs and other shmem based APIs). */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) #define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) __u64 features; __u64 ioctls; -- cgit v1.2.3