From 3fc21924100b13f73c734d0ce8dfcfe913fcf7a8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 24 Feb 2017 14:55:48 -0800 Subject: mm: validate device_hotplug is held for memory hotplug mem_hotplug_begin() assumes that it can set mem_hotplug.active_writer and run the hotplug process without racing another thread. Validate this assumption with a lockdep assertion. Link: http://lkml.kernel.org/r/148693886229.16345.1770484669403334689.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Ben Hutchings Cc: Michal Hocko Cc: Toshi Kani Cc: Vlastimil Babka Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Masayoshi Mizuma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index bd684fc8ec1d..a48a7ff70164 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1139,6 +1139,7 @@ static inline bool device_supports_offline(struct device *dev) extern void lock_device_hotplug(void); extern void unlock_device_hotplug(void); extern int lock_device_hotplug_sysfs(void); +void assert_held_device_hotplug(void); extern int device_offline(struct device *dev); extern int device_online(struct device *dev); extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); -- cgit v1.2.3 From 0262d9c845ec349edf93f69688a5129c36cc2232 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 24 Feb 2017 14:55:59 -0800 Subject: memblock: embed memblock type name within struct memblock_type Provide the name of each memblock type with struct memblock_type. This allows to get rid of the function memblock_type_name() and duplicating the type names in __memblock_dump_all(). The only memblock_type usage out of mm/memblock.c seems to be arch/s390/kernel/crash_dump.c. While at it, give it a name. Link: http://lkml.kernel.org/r/20170120123456.46508-4-heiko.carstens@de.ibm.com Signed-off-by: Heiko Carstens Cc: Philipp Hachtmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 38bcf00cbed3..bdfc65af4152 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -42,6 +42,7 @@ struct memblock_type { unsigned long max; /* size of the allocated array */ phys_addr_t total_size; /* size of all regions */ struct memblock_region *regions; + char *name; }; struct memblock { -- cgit v1.2.3 From d811914d87576c562e849c00d9f9beff45038801 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:56:02 -0800 Subject: userfaultfd: non-cooperative: rename *EVENT_MADVDONTNEED to *EVENT_REMOVE Patch series "userfaultfd: non-cooperative: add madvise() event for MADV_REMOVE request". These patches add notification of madvise(MADV_REMOVE) event to non-cooperative userfaultfd monitor. The first pacth renames EVENT_MADVDONTNEED to EVENT_REMOVE along with relevant functions and structures. Using _REMOVE instead of _MADVDONTNEED describes the event semantics more clearly and I hope it's not too late for such change in the ABI. This patch (of 3): The UFFD_EVENT_MADVDONTNEED purpose is to notify uffd monitor about removal of certain range from address space tracked by userfaultfd. Hence, UFFD_EVENT_REMOVE seems to better reflect the operation semantics. Respectively, 'madv_dn' field of uffd_msg is renamed to 'remove' and the madvise_userfault_dontneed callback is renamed to userfaultfd_remove. Link: http://lkml.kernel.org/r/1484814154-1557-2-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 16 ++++++++-------- include/uapi/linux/userfaultfd.h | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f431861f22f1..2521542f6c07 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -61,10 +61,10 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); -extern void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, - unsigned long end); +extern void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end); #else /* CONFIG_USERFAULTFD */ @@ -112,10 +112,10 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, { } -static inline void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, - unsigned long end) +static inline void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end) { } #endif /* CONFIG_USERFAULTFD */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 9ac4b68c54d1..b742c40c2880 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -20,7 +20,7 @@ #define UFFD_API ((__u64)0xAA) #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_MADVDONTNEED | \ + UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ @@ -92,7 +92,7 @@ struct uffd_msg { struct { __u64 start; __u64 end; - } madv_dn; + } remove; struct { /* unused reserved fields */ @@ -109,7 +109,7 @@ struct uffd_msg { #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 -#define UFFD_EVENT_MADVDONTNEED 0x15 +#define UFFD_EVENT_REMOVE 0x15 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -155,7 +155,7 @@ struct uffdio_api { #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) -#define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) __u64 features; -- cgit v1.2.3 From 1276ad68e2491d1ceeb65f55d790f9277593c459 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 Feb 2017 14:56:11 -0800 Subject: mm: vmscan: scan dirty pages even in laptop mode Patch series "mm: vmscan: fix kswapd writeback regression". We noticed a regression on multiple hadoop workloads when moving from 3.10 to 4.0 and 4.6, which involves kswapd getting tangled up in page writeout, causing direct reclaim herds that also don't make progress. I tracked it down to the thrash avoidance efforts after 3.10 that make the kernel better at keeping use-once cache and use-many cache sorted on the inactive and active list, with more aggressive protection of the active list as long as there is inactive cache. Unfortunately, our workload's use-once cache is mostly from streaming writes. Waiting for writes to avoid potential reloads in the future is not a good tradeoff. These patches do the following: 1. Wake the flushers when kswapd sees a lump of dirty pages. It's possible to be below the dirty background limit and still have cache velocity push them through the LRU. So start a-flushin'. 2. Let kswapd only write pages that have been rotated twice. This makes sure we really tried to get all the clean pages on the inactive list before resorting to horrible LRU-order writeback. 3. Move rotating dirty pages off the inactive list. Instead of churning or waiting on page writeback, we'll go after clean active cache. This might lead to thrashing, but in this state memory demand outstrips IO speed anyway, and reads are faster than writes. Mel backported the series to 4.10-rc5 with one minor conflict and ran a couple of tests on it. Mix of read/write random workload didn't show anything interesting. Write-only database didn't show much difference in performance but there were slight reductions in IO -- probably in the noise. simoop did show big differences although not as big as Mel expected. This is Chris Mason's workload that similate the VM activity of hadoop. Mel won't go through the full details but over the samples measured during an hour it reported 4.10.0-rc5 4.10.0-rc5 vanilla johannes-v1r1 Amean p50-Read 21346531.56 ( 0.00%) 21697513.24 ( -1.64%) Amean p95-Read 24700518.40 ( 0.00%) 25743268.98 ( -4.22%) Amean p99-Read 27959842.13 ( 0.00%) 28963271.11 ( -3.59%) Amean p50-Write 1138.04 ( 0.00%) 989.82 ( 13.02%) Amean p95-Write 1106643.48 ( 0.00%) 12104.00 ( 98.91%) Amean p99-Write 1569213.22 ( 0.00%) 36343.38 ( 97.68%) Amean p50-Allocation 85159.82 ( 0.00%) 79120.70 ( 7.09%) Amean p95-Allocation 204222.58 ( 0.00%) 129018.43 ( 36.82%) Amean p99-Allocation 278070.04 ( 0.00%) 183354.43 ( 34.06%) Amean final-p50-Read 21266432.00 ( 0.00%) 21921792.00 ( -3.08%) Amean final-p95-Read 24870912.00 ( 0.00%) 26116096.00 ( -5.01%) Amean final-p99-Read 28147712.00 ( 0.00%) 29523968.00 ( -4.89%) Amean final-p50-Write 1130.00 ( 0.00%) 977.00 ( 13.54%) Amean final-p95-Write 1033216.00 ( 0.00%) 2980.00 ( 99.71%) Amean final-p99-Write 1517568.00 ( 0.00%) 32672.00 ( 97.85%) Amean final-p50-Allocation 86656.00 ( 0.00%) 78464.00 ( 9.45%) Amean final-p95-Allocation 211712.00 ( 0.00%) 116608.00 ( 44.92%) Amean final-p99-Allocation 287232.00 ( 0.00%) 168704.00 ( 41.27%) The latencies are actually completely horrific in comparison to 4.4 (and 4.10-rc5 is worse than 4.9 according to historical data for reasons Mel hasn't analysed yet). Still, 95% of write latency (p95-write) is halved by the series and allocation latency is way down. Direct reclaim activity is one fifth of what it was according to vmstats. Kswapd activity is higher but this is not necessarily surprising. Kswapd efficiency is unchanged at 99% (99% of pages scanned were reclaimed) but direct reclaim efficiency went from 77% to 99% In the vanilla kernel, 627MB of data was written back from reclaim context. With the series, no data was written back. With or without the patch, pages are being immediately reclaimed after writeback completes. However, with the patch, only 1/8th of the pages are reclaimed like this. This patch (of 5): We have an elaborate dirty/writeback throttling mechanism inside the reclaim scanner, but for that to work the pages have to go through shrink_page_list() and get counted for what they are. Otherwise, we mess up the LRU order and don't match reclaim speed to writeback. Especially during deactivation, there is never a reason to skip dirty pages; nothing is even trying to write them out from there. Don't mess up the LRU order for nothing, shuffle these pages along. Link: http://lkml.kernel.org/r/20170123181641.23938-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 82fc632fd11d..8e02b3750fe0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -236,8 +236,6 @@ struct lruvec { #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) -/* Isolate clean file */ -#define ISOLATE_CLEAN ((__force isolate_mode_t)0x1) /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ -- cgit v1.2.3 From 726d061fbd3658e4bfeffa1b8e82da97de2ca4dd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 Feb 2017 14:56:14 -0800 Subject: mm: vmscan: kick flushers when we encounter dirty pages on the LRU Memory pressure can put dirty pages at the end of the LRU without anybody running into dirty limits. Don't start writing individual pages from kswapd while the flushers might be asleep. Unlike the old direct reclaim flusher wakeup (removed in the next patch) that flushes the number of pages just scanned, this patch wakes the flushers for all outstanding dirty pages. That seemed to perform better in a synthetic test that pushes dirty pages to the end of the LRU and into reclaim, because we know LRU aging outstrips writeback already, and this way we give younger dirty pages a headstart rather than wait until reclaim runs into them as well. It also means less plugging and risk of exhausting the struct request pool from reclaim. There is a concern that this will cause temporary files that used to get dirtied and truncated before writeback to now get written to disk under memory pressure. If this turns out to be a real problem, we'll have to revisit this and tame the reclaim flusher wakeups. [hannes@cmpxchg.org: mention dirty expiration as a condition] Link: http://lkml.kernel.org/r/20170126174739.GA30636@cmpxchg.org Link: http://lkml.kernel.org/r/20170123181641.23938-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 2 +- include/trace/events/writeback.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 5527d910ba3d..a3c0cbd7c888 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -46,7 +46,7 @@ enum writeback_sync_modes { */ enum wb_reason { WB_REASON_BACKGROUND, - WB_REASON_TRY_TO_FREE_PAGES, + WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, WB_REASON_LAPTOP_TIMER, diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 2ccd9ccbf9ef..7bd8783a590f 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -31,7 +31,7 @@ #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ - EM( WB_REASON_TRY_TO_FREE_PAGES, "try_to_free_pages") \ + EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ -- cgit v1.2.3 From c55e8d035b28b2867e68b0e2d0eee2c0f1016b43 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 Feb 2017 14:56:23 -0800 Subject: mm: vmscan: move dirty pages out of the way until they're flushed We noticed a performance regression when moving hadoop workloads from 3.10 kernels to 4.0 and 4.6. This is accompanied by increased pageout activity initiated by kswapd as well as frequent bursts of allocation stalls and direct reclaim scans. Even lowering the dirty ratios to the equivalent of less than 1% of memory would not eliminate the issue, suggesting that dirty pages concentrate where the scanner is looking. This can be traced back to recent efforts of thrash avoidance. Where 3.10 would not detect refaulting pages and continuously supply clean cache to the inactive list, a thrashing workload on 4.0+ will detect and activate refaulting pages right away, distilling used-once pages on the inactive list much more effectively. This is by design, and it makes sense for clean cache. But for the most part our workload's cache faults are refaults and its use-once cache is from streaming writes. We end up with most of the inactive list dirty, and we don't go after the active cache as long as we have use-once pages around. But waiting for writes to avoid reclaiming clean cache that *might* refault is a bad trade-off. Even if the refaults happen, reads are faster than writes. Before getting bogged down on writeback, reclaim should first look at *all* cache in the system, even active cache. To accomplish this, activate pages that are dirty or under writeback when they reach the end of the inactive LRU. The pages are marked for immediate reclaim, meaning they'll get moved back to the inactive LRU tail as soon as they're written back and become reclaimable. But in the meantime, by reducing the inactive list to only immediately reclaimable pages, we allow the scanner to deactivate and refill the inactive list with clean cache from the active list tail to guarantee forward progress. [hannes@cmpxchg.org: update comment] Link: http://lkml.kernel.org/r/20170202191957.22872-8-hannes@cmpxchg.org Link: http://lkml.kernel.org/r/20170123181641.23938-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 41d376e7116d..e030a68ead7e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -50,6 +50,13 @@ static __always_inline void add_page_to_lru_list(struct page *page, list_add(&page->lru, &lruvec->lists[lru]); } +static __always_inline void add_page_to_lru_list_tail(struct page *page, + struct lruvec *lruvec, enum lru_list lru) +{ + update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); +} + static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { -- cgit v1.2.3 From 11bac80004499ea59f361ef2a5516c84b6eab675 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 24 Feb 2017 14:56:41 -0800 Subject: mm, fs: reduce fault, page_mkwrite, and pfn_mkwrite to take only vmf ->fault(), ->page_mkwrite(), and ->pfn_mkwrite() calls do not need to take a vma and vmf parameter when the vma already resides in vmf. Remove the vma parameter to simplify things. [arnd@arndb.de: fix ARM build] Link: http://lkml.kernel.org/r/20170125223558.1451224-1-arnd@arndb.de Link: http://lkml.kernel.org/r/148521301778.19116.10840599906674778980.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Signed-off-by: Arnd Bergmann Reviewed-by: Ross Zwisler Cc: Theodore Ts'o Cc: Darrick J. Wong Cc: Matthew Wilcox Cc: Dave Hansen Cc: Christoph Hellwig Cc: Jan Kara Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 5 ++--- include/linux/iomap.h | 3 +-- include/linux/mm.h | 10 +++++----- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 1e77ff5818f1..eeb02421c848 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -38,8 +38,7 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - const struct iomap_ops *ops); +int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, @@ -83,7 +82,7 @@ static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, return VM_FAULT_FALLBACK; } #endif -int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); +int dax_pfn_mkwrite(struct vm_fault *vmf); static inline bool vma_is_dax(struct vm_area_struct *vma) { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 891459caa278..7291810067eb 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -79,8 +79,7 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - const struct iomap_ops *ops); +int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, const struct iomap_ops *ops); diff --git a/include/linux/mm.h b/include/linux/mm.h index 574bc157a27c..3dd80ba6568a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -350,17 +350,17 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); - int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*fault)(struct vm_fault *vmf); int (*pmd_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ - int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware @@ -2124,10 +2124,10 @@ extern void truncate_inode_pages_range(struct address_space *, extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ -extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); +extern int filemap_fault(struct vm_fault *vmf); extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); -extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int filemap_page_mkwrite(struct vm_fault *vmf); /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); -- cgit v1.2.3 From a2d581675d485eb7188f521f36efc114639a3096 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 24 Feb 2017 14:56:59 -0800 Subject: mm,fs,dax: change ->pmd_fault to ->huge_fault Patch series "1G transparent hugepage support for device dax", v2. The following series implements support for 1G trasparent hugepage on x86 for device dax. The bulk of the code was written by Mathew Wilcox a while back supporting transparent 1G hugepage for fs DAX. I have forward ported the relevant bits to 4.10-rc. The current submission has only the necessary code to support device DAX. Comments from Dan Williams: So the motivation and intended user of this functionality mirrors the motivation and users of 1GB page support in hugetlbfs. Given expected capacities of persistent memory devices an in-memory database may want to reduce tlb pressure beyond what they can already achieve with 2MB mappings of a device-dax file. We have customer feedback to that effect as Willy mentioned in his previous version of these patches [1]. [1]: https://lkml.org/lkml/2016/1/31/52 Comments from Nilesh @ Oracle: There are applications which have a process model; and if you assume 10,000 processes attempting to mmap all the 6TB memory available on a server; we are looking at the following: processes : 10,000 memory : 6TB pte @ 4k page size: 8 bytes / 4K of memory * #processes = 6TB / 4k * 8 * 10000 = 1.5GB * 80000 = 120,000GB pmd @ 2M page size: 120,000 / 512 = ~240GB pud @ 1G page size: 240GB / 512 = ~480MB As you can see with 2M pages, this system will use up an exorbitant amount of DRAM to hold the page tables; but the 1G pages finally brings it down to a reasonable level. Memory sizes will keep increasing; so this number will keep increasing. An argument can be made to convert the applications from process model to thread model, but in the real world that may not be always practical. Hopefully this helps explain the use case where this is valuable. This patch (of 3): In preparation for adding the ability to handle PUD pages, convert vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault. The vm_fault structure is extended to include a union of the different page table pointers that may be needed, and three flag bits are reserved to indicate which type of pointer is in the union. [ross.zwisler@linux.intel.com: remove unused function ext4_dax_huge_fault()] Link: http://lkml.kernel.org/r/1485813172-7284-1-git-send-email-ross.zwisler@linux.intel.com [dave.jiang@intel.com: clear PMD or PUD size flags when in fall through path] Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Matthew Wilcox Signed-off-by: Dave Jiang Signed-off-by: Ross Zwisler Cc: Dave Hansen Cc: Vlastimil Babka Cc: Jan Kara Cc: Dan Williams Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Dave Jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 6 ------ include/linux/mm.h | 10 +++++++++- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index eeb02421c848..cf9af225962b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -70,17 +70,11 @@ static inline unsigned int dax_radix_order(void *entry) return PMD_SHIFT - PAGE_SHIFT; return 0; } -int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void *entry) { return 0; } -static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, - const struct iomap_ops *ops) -{ - return VM_FAULT_FALLBACK; -} #endif int dax_pfn_mkwrite(struct vm_fault *vmf); diff --git a/include/linux/mm.h b/include/linux/mm.h index 3dd80ba6568a..035a688e5472 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,6 +285,11 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_SIZE_MASK 0x7000 /* Support up to 8-level page tables */ +#define FAULT_FLAG_SIZE_PTE 0x0000 /* First level (eg 4k) */ +#define FAULT_FLAG_SIZE_PMD 0x1000 /* Second level (eg 2MB) */ +#define FAULT_FLAG_SIZE_PUD 0x2000 /* Third level (eg 1GB) */ + #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ @@ -314,6 +319,9 @@ struct vm_fault { unsigned long address; /* Faulting virtual address */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ + pud_t *pud; /* Pointer to pud entry matching + * the 'address' + */ pte_t orig_pte; /* Value of PTE at the time of fault */ struct page *cow_page; /* Page handler may use for COW fault */ @@ -351,7 +359,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_fault *vmf); - int (*pmd_fault)(struct vm_fault *vmf); + int (*huge_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); -- cgit v1.2.3 From a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Feb 2017 14:57:02 -0800 Subject: mm, x86: add support for PUD-sized transparent hugepages The current transparent hugepage code only supports PMDs. This patch adds support for transparent use of PUDs with DAX. It does not include support for anonymous pages. x86 support code also added. Most of this patch simply parallels the work that was done for huge PMDs. The only major difference is how the new ->pud_entry method in mm_walk works. The ->pmd_entry method replaces the ->pte_entry method, whereas the ->pud_entry method works along with either ->pmd_entry or ->pte_entry. The pagewalk code takes care of locking the PUD before calling ->pud_walk, so handlers do not need to worry whether the PUD is stable. [dave.jiang@intel.com: fix SMP x86 32bit build for native_pud_clear()] Link: http://lkml.kernel.org/r/148719066814.31111.3239231168815337012.stgit@djiang5-desk3.ch.intel.com [dave.jiang@intel.com: native_pud_clear missing on i386 build] Link: http://lkml.kernel.org/r/148640375195.69754.3315433724330910314.stgit@djiang5-desk3.ch.intel.com Link: http://lkml.kernel.org/r/148545059381.17912.8602162635537598445.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Matthew Wilcox Signed-off-by: Dave Jiang Tested-by: Alexander Kapshuk Cc: Dave Hansen Cc: Vlastimil Babka Cc: Jan Kara Cc: Dan Williams Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 80 ++++++++++++++++++++++++++++++++++++++--- include/asm-generic/tlb.h | 14 ++++++++ include/linux/huge_mm.h | 83 ++++++++++++++++++++++++++++++++++++++++--- include/linux/mm.h | 30 +++++++++++++++- include/linux/mmu_notifier.h | 14 ++++++++ include/linux/pfn_t.h | 12 +++++++ 6 files changed, 223 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 18af2bcefe6a..a0aba0f9c57b 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -36,6 +36,9 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, @@ -44,6 +47,13 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, BUILD_BUG(); return 0; } +static inline int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty) +{ + BUILD_BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -121,8 +131,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #endif -#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) @@ -131,20 +141,40 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_clear(pmdp); return pmd; } +#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ +#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, + pud_t *pudp) +{ + pud_t pud = *pudp; + + pud_clear(pudp); + return pud; +} +#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#endif -#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(mm, address, pmdp); } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL +static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm, + unsigned long address, pud_t *pudp, + int full) +{ + return pudp_huge_get_and_clear(mm, address, pudp); +} +#endif +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, @@ -181,6 +211,9 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma, extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pud_t *pudp); #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -208,6 +241,23 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline void pudp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pud_t *pudp) +{ + pud_t old_pud = *pudp; + + set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); +} +#else +static inline void pudp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pud_t *pudp) +{ + BUILD_BUG(); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -273,12 +323,23 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } + +static inline int pud_same(pud_t pud_a, pud_t pud_b) +{ + return pud_val(pud_a) == pud_val(pud_b); +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { BUILD_BUG(); return 0; } + +static inline int pud_same(pud_t pud_a, pud_t pud_b) +{ + BUILD_BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -640,6 +701,15 @@ static inline int pmd_write(pmd_t pmd) #endif /* __HAVE_ARCH_PMD_WRITE */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ + (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) +static inline int pud_trans_huge(pud_t pud) +{ + return 0; +} +#endif + #ifndef pmd_read_atomic static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { @@ -785,8 +855,10 @@ static inline int pmd_clear_huge(pmd_t *pmd) * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() +#define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 7eed8cf3130a..4329bc6ef04b 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -232,6 +232,20 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) +/** + * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb + * invalidation. This is a nop so far, because only x86 needs it. + */ +#ifndef __tlb_remove_pud_tlb_entry +#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) +#endif + +#define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ + do { \ + __tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE); \ + __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ + } while (0) + /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f0029e786205..a3762d49ba39 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,18 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); +extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma); + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); +#else +static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) +{ +} +#endif + extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -17,6 +29,9 @@ extern bool madvise_free_huge_pmd(struct mmu_gather *tlb, extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); +extern int zap_huge_pud(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pud_t *pud, unsigned long addr); extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); @@ -26,8 +41,10 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); -int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, - pfn_t pfn, bool write); +int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, pfn_t pfn, bool write); +int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, @@ -58,13 +75,14 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_NR (1<vm_mm->mmap_sem), vma); + if (pud_trans_huge(*pud) || pud_devmap(*pud)) + return __pud_trans_huge_lock(pud, vma); + else + return NULL; +} static inline int hpage_nr_pages(struct page *page) { if (unlikely(PageTransHuge(page))) @@ -143,6 +183,11 @@ static inline int hpage_nr_pages(struct page *page) return 1; } +struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, int flags); +struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, int flags); + extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -157,6 +202,11 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return is_huge_zero_page(pmd_page(pmd)); } +static inline bool is_huge_zero_pud(pud_t pud) +{ + return false; +} + struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); @@ -167,6 +217,10 @@ void mm_put_huge_zero_page(struct mm_struct *mm); #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) +#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) +#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) +#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) + #define hpage_nr_pages(x) 1 #define transparent_hugepage_enabled(__vma) 0 @@ -195,6 +249,9 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} +#define split_huge_pud(__vma, __pmd, __address) \ + do { } while (0) + static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { @@ -212,6 +269,11 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, { return NULL; } +static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, + struct vm_area_struct *vma) +{ + return NULL; +} static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) { @@ -223,6 +285,11 @@ static inline bool is_huge_zero_page(struct page *page) return false; } +static inline bool is_huge_zero_pud(pud_t pud) +{ + return false; +} + static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; @@ -233,6 +300,12 @@ static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, { return NULL; } + +static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud, int flags) +{ + return NULL; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 035a688e5472..d8b75d7d6a9e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -424,6 +424,10 @@ static inline int pmd_devmap(pmd_t pmd) { return 0; } +static inline int pud_devmap(pud_t pud) +{ + return 0; +} #endif /* @@ -1199,6 +1203,10 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, /** * mm_walk - callbacks for walk_page_range + * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry + * this handler should only handle pud_trans_huge() puds. + * the pmd_entry or pte_entry callbacks will be used for + * regular PUDs. * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to @@ -1218,6 +1226,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * (see the comment on walk_page_range() for more details) */ struct mm_walk { + int (*pud_entry)(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pte_entry)(pte_t *pte, unsigned long addr, @@ -1801,8 +1811,26 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } -extern void __init pagecache_init(void); +/* + * No scalability reason to split PUD locks yet, but follow the same pattern + * as the PMD locks to make it easier if we decide to. The VM should not be + * considered ready to switch to split PUD locks yet; there may be places + * which need to be converted from page_table_lock. + */ +static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) +{ + return &mm->page_table_lock; +} + +static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) +{ + spinlock_t *ptl = pud_lockptr(mm, pud); + + spin_lock(ptl); + return ptl; +} +extern void __init pagecache_init(void); extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index a1a210d59961..51891fb0d3ce 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -381,6 +381,19 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pmd; \ }) +#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \ +({ \ + unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \ + struct mm_struct *___mm = (__vma)->vm_mm; \ + pud_t ___pud; \ + \ + ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \ + mmu_notifier_invalidate_range(___mm, ___haddr, \ + ___haddr + HPAGE_PUD_SIZE); \ + \ + ___pud; \ +}) + #define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ @@ -475,6 +488,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define pmdp_clear_young_notify pmdp_test_and_clear_young #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush +#define pudp_huge_clear_flush_notify pudp_huge_clear_flush #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 033fc7bbcefa..a49b3259cad7 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -90,6 +90,13 @@ static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot) { return pfn_pmd(pfn_t_to_pfn(pfn), pgprot); } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot) +{ + return pfn_pud(pfn_t_to_pfn(pfn), pgprot); +} +#endif #endif #ifdef __HAVE_ARCH_PTE_DEVMAP @@ -106,5 +113,10 @@ static inline bool pfn_t_devmap(pfn_t pfn) } pte_t pte_mkdevmap(pte_t pte); pmd_t pmd_mkdevmap(pmd_t pmd); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +pud_t pud_mkdevmap(pud_t pud); #endif +#endif /* __HAVE_ARCH_PTE_DEVMAP */ + #endif /* _LINUX_PFN_T_H_ */ -- cgit v1.2.3 From c791ace1e747371658237f0d30234fef56c39669 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 24 Feb 2017 14:57:08 -0800 Subject: mm: replace FAULT_FLAG_SIZE with parameter to huge_fault Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has been somewhat painful with getting the flags set and removed at the correct locations. More than one kernel oops was introduced due to difficulties of getting the placement correctly. Remove the flag values and introduce an input parameter to huge_fault that indicates the size of the page entry. This makes the code easier to trace and should avoid the issues we see with the fault flags where removal of the flag was necessary in the fallback paths. Link: http://lkml.kernel.org/r/148615748258.43180.1690152053774975329.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Tested-by: Dan Williams Reviewed-by: Jan Kara Cc: Matthew Wilcox Cc: Dave Hansen Cc: Vlastimil Babka Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 3 ++- include/linux/mm.h | 14 ++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index cf9af225962b..d8a3dc042e1c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -38,7 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops); +int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, diff --git a/include/linux/mm.h b/include/linux/mm.h index d8b75d7d6a9e..c65aa43b5712 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,11 +285,6 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ -#define FAULT_FLAG_SIZE_MASK 0x7000 /* Support up to 8-level page tables */ -#define FAULT_FLAG_SIZE_PTE 0x0000 /* First level (eg 4k) */ -#define FAULT_FLAG_SIZE_PMD 0x1000 /* Second level (eg 2MB) */ -#define FAULT_FLAG_SIZE_PUD 0x2000 /* Third level (eg 1GB) */ - #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ @@ -349,6 +344,13 @@ struct vm_fault { */ }; +/* page entry size for vm->huge_fault() */ +enum page_entry_size { + PE_SIZE_PTE = 0, + PE_SIZE_PMD, + PE_SIZE_PUD, +}; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -359,7 +361,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_fault *vmf); - int (*huge_fault)(struct vm_fault *vmf); + int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); -- cgit v1.2.3 From 9e5bcd610ffcedf5e485e78a72762810b25c7f25 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 24 Feb 2017 14:57:29 -0800 Subject: mm/migration: make isolate_movable_page() return int type Patch series "HWPOISON: soft offlining for non-lru movable page", v6. After Minchan's commit bda807d44454 ("mm: migrate: support non-lru movable page migration"), some type of non-lru page like zsmalloc and virtio-balloon page also support migration. Therefore, we can: 1) soft offlining no-lru movable pages, which means when memory corrected errors occur on a non-lru movable page, we can stop to use it by migrating data onto another page and disable the original (maybe half-broken) one. 2) enable memory hotplug for non-lru movable pages, i.e. we may offline blocks, which include such pages, by using non-lru page migration. This patchset is heavily dependent on non-lru movable page migration. This patch (of 4): Change the return type of isolate_movable_page() from bool to int. It will return 0 when isolate movable page successfully, and return -EBUSY when it isolates failed. There is no functional change within this patch but prepare for later patch. [xieyisheng1@huawei.com: v6] Link: http://lkml.kernel.org/r/1486108770-630-2-git-send-email-xieyisheng1@huawei.com Link: http://lkml.kernel.org/r/1485867981-16037-2-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Suggested-by: Michal Hocko Acked-by: Minchan Kim Cc: Andi Kleen Cc: Hanjun Guo Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Naoya Horiguchi Cc: Reza Arbab Cc: Taku Izumi Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ae8d475a9385..43d5deb0c4cc 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -37,7 +37,7 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); -extern bool isolate_movable_page(struct page *page, isolate_mode_t mode); +extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); extern int migrate_prep(void); -- cgit v1.2.3 From cbae0170e5329c79c0a36a81fedd2800146aca12 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 24 Feb 2017 14:57:32 -0800 Subject: mm/migration: make isolate_movable_page always defined Define isolate_movable_page as a static inline function when CONFIG_MIGRATION is not enable. It should return -EBUSY here which means failed to isolate movable pages. This patch do not have any functional change but prepare for later patch. Link: http://lkml.kernel.org/r/1485867981-16037-3-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Acked-by: Minchan Kim Suggested-by: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Andi Kleen Cc: Hanjun Guo Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Reza Arbab Cc: Taku Izumi Cc: Vitaly Kuznetsov Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 43d5deb0c4cc..fa76b516fa47 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -56,6 +56,8 @@ static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason) { return -ENOSYS; } +static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) + { return -EBUSY; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } -- cgit v1.2.3 From ace71a19cec5eb430207c3269d8a2683f0574306 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 24 Feb 2017 14:57:45 -0800 Subject: mm: introduce page_vma_mapped_walk() Introduce a new interface to check if a page is mapped into a vma. It aims to address shortcomings of page_check_address{,_transhuge}. Existing interface is not able to handle PTE-mapped THPs: it only finds the first PTE. The rest lefted unnoticed. page_vma_mapped_walk() iterates over all possible mapping of the page in the vma. Link: http://lkml.kernel.org/r/20170129173858.45174-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 15321fb1df6b..b76343610653 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -9,6 +9,7 @@ #include #include #include +#include /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -232,6 +233,31 @@ static inline bool page_check_address_transhuge(struct page *page, } #endif +/* Avoid racy checks */ +#define PVMW_SYNC (1 << 0) +/* Look for migarion entries rather than present PTEs */ +#define PVMW_MIGRATION (1 << 1) + +struct page_vma_mapped_walk { + struct page *page; + struct vm_area_struct *vma; + unsigned long address; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + unsigned int flags; +}; + +static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) +{ + if (pvmw->pte) + pte_unmap(pvmw->pte); + if (pvmw->ptl) + spin_unlock(pvmw->ptl); +} + +bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); + /* * Used by swapoff to help locate where page is expected in vma. */ -- cgit v1.2.3 From d53a8b49a626fdfce4390710da6d04b4314db25f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 24 Feb 2017 14:58:13 -0800 Subject: mm: drop page_check_address{,_transhuge} All users are gone. Let's drop them. Link: http://lkml.kernel.org/r/20170129173858.45174-12-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 36 ------------------------------------ 1 file changed, 36 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b76343610653..8c89e902df3e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -197,42 +197,6 @@ int page_referenced(struct page *, int is_locked, int try_to_unmap(struct page *, enum ttu_flags flags); -/* - * Used by uprobes to replace a userspace page safely - */ -pte_t *__page_check_address(struct page *, struct mm_struct *, - unsigned long, spinlock_t **, int); - -static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, - spinlock_t **ptlp, int sync) -{ - pte_t *ptep; - - __cond_lock(*ptlp, ptep = __page_check_address(page, mm, address, - ptlp, sync)); - return ptep; -} - -/* - * Used by idle page tracking to check if a page was referenced via page - * tables. - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, - unsigned long address, pmd_t **pmdp, - pte_t **ptep, spinlock_t **ptlp); -#else -static inline bool page_check_address_transhuge(struct page *page, - struct mm_struct *mm, unsigned long address, - pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp) -{ - *ptep = page_check_address(page, mm, address, ptlp, 0); - *pmdp = NULL; - return !!*ptep; -} -#endif - /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) /* Look for migarion entries rather than present PTEs */ -- cgit v1.2.3 From 897ab3e0c49e24b62e2d54d165c7afec6bbca65b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:58:22 -0800 Subject: userfaultfd: non-cooperative: add event for memory unmaps When a non-cooperative userfaultfd monitor copies pages in the background, it may encounter regions that were already unmapped. Addition of UFFD_EVENT_UNMAP allows the uffd monitor to track precisely changes in the virtual memory layout. Since there might be different uffd contexts for the affected VMAs, we first should create a temporary representation for the unmap event for each uffd context and then notify them one by one to the appropriate userfault file descriptors. The event notification occurs after the mmap_sem has been released. [arnd@arndb.de: fix nommu build] Link: http://lkml.kernel.org/r/20170203165141.3665284-1-arnd@arndb.de [mhocko@suse.com: fix nommu build] Link: http://lkml.kernel.org/r/20170202091503.GA22823@dhcp22.suse.cz Link: http://lkml.kernel.org/r/1485542673-24387-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Michal Hocko Signed-off-by: Arnd Bergmann Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 +++++++++----- include/linux/userfaultfd_k.h | 18 ++++++++++++++++++ include/uapi/linux/userfaultfd.h | 3 +++ 3 files changed, 30 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c65aa43b5712..c6fcba1d1ae5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2090,18 +2090,22 @@ extern int install_special_mapping(struct mm_struct *mm, extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate); -extern int do_munmap(struct mm_struct *, unsigned long, size_t); + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, + struct list_head *uf); +extern int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf); static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, unsigned long *populate) + unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { - return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate); + return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf); } #ifdef CONFIG_MMU diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 2521542f6c07..a40be5d0661b 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -66,6 +66,12 @@ extern void userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); +extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf); +extern void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -118,6 +124,18 @@ static inline void userfaultfd_remove(struct vm_area_struct *vma, unsigned long end) { } + +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf) +{ + return 0; +} + +static inline void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index b742c40c2880..3b059530dac9 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -21,6 +21,7 @@ #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ @@ -110,6 +111,7 @@ struct uffd_msg { #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -158,6 +160,7 @@ struct uffdio_api { #define UFFD_FEATURE_EVENT_REMOVE (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) __u64 features; __u64 ioctls; -- cgit v1.2.3 From ca49ca7114553587736fe78319e22f073b631380 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:58:25 -0800 Subject: userfaultfd: non-cooperative: add event for exit() notification Allow userfaultfd monitor track termination of the processes that have memory backed by the uffd. [rppt@linux.vnet.ibm.com: add comment] Link: http://lkml.kernel.org/r/20170202135448.GB19804@rapoport-lnxLink: http://lkml.kernel.org/r/1485542673-24387-4-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 7 +++++++ include/uapi/linux/userfaultfd.h | 5 ++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a40be5d0661b..0468548acebf 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -72,6 +72,8 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); +extern void userfaultfd_exit(struct mm_struct *mm); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -136,6 +138,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { } + +static inline void userfaultfd_exit(struct mm_struct *mm) +{ +} + #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 3b059530dac9..c055947c5c98 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,7 +18,8 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_EXIT | \ + UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ @@ -112,6 +113,7 @@ struct uffd_msg { #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 #define UFFD_EVENT_UNMAP 0x16 +#define UFFD_EVENT_EXIT 0x17 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -161,6 +163,7 @@ struct uffdio_api { #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_EVENT_EXIT (1<<7) __u64 features; __u64 ioctls; -- cgit v1.2.3 From ca96b625341027f611c3e61351a70311077ebcf5 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Feb 2017 14:58:37 -0800 Subject: mm: alloc_contig_range: allow to specify GFP mask Currently alloc_contig_range assumes that the compaction should be done with the default GFP_KERNEL flags. This is probably right for all current uses of this interface, but may change as CMA is used in more use-cases (including being the default DMA memory allocator on some platforms). Change the function prototype, to allow for passing through the GFP mask set by upper layers. Also respect global restrictions by applying memalloc_noio_flags to the passed in flags. Link: http://lkml.kernel.org/r/20170127172328.18574-1-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0fe0b6295ab5..db373b9d3223 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -541,7 +541,7 @@ static inline bool pm_suspended_storage(void) #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype); + unsigned migratetype, gfp_t gfp_mask); extern void free_contig_range(unsigned long pfn, unsigned nr_pages); #endif -- cgit v1.2.3 From e2f466e32f56c8b3ee0d1a40cc39e1c779dfd598 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Feb 2017 14:58:41 -0800 Subject: mm: cma_alloc: allow to specify GFP mask Most users of this interface just want to use it with the default GFP_KERNEL flags, but for cases where DMA memory is allocated it may be called from a different context. No functional change yet, just passing through the flag to the underlying alloc_contig_range function. Link: http://lkml.kernel.org/r/20170127172328.18574-2-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cma.h b/include/linux/cma.h index 6f0a91b37f68..03f32d0bd1d8 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -29,6 +29,7 @@ extern int __init cma_declare_contiguous(phys_addr_t base, extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align); +extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, + gfp_t gfp_mask); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); #endif -- cgit v1.2.3 From 712c604dcdf8186295e2af694adf52c6842ad100 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Feb 2017 14:58:44 -0800 Subject: mm: wire up GFP flag passing in dma_alloc_from_contiguous The callers of the DMA alloc functions already provide the proper context GFP flags. Make sure to pass them through to the CMA allocator, to make the CMA compaction context aware. Link: http://lkml.kernel.org/r/20170127172328.18574-3-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-contiguous.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index fec734df1524..b67bf6ac907d 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -112,7 +112,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size, } struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order); + unsigned int order, gfp_t gfp_mask); bool dma_release_from_contiguous(struct device *dev, struct page *pages, int count); @@ -145,7 +145,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size, static inline struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order) + unsigned int order, gfp_t gfp_mask) { return NULL; } -- cgit v1.2.3 From def5efe0376501ef7bd6b53ed061512c142e59aa Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 24 Feb 2017 14:58:47 -0800 Subject: mm, madvise: fail with ENOMEM when splitting vma will hit max_map_count If madvise(2) advice will result in the underlying vma being split and the number of areas mapped by the process will exceed /proc/sys/vm/max_map_count as a result, return ENOMEM instead of EAGAIN. EAGAIN is returned by madvise(2) when a kernel resource, such as slab, is temporarily unavailable. It indicates that userspace should retry the advice in the near future. This is important for advice such as MADV_DONTNEED which is often used by malloc implementations to free memory back to the system: we really do want to free memory back when madvise(2) returns EAGAIN because slab allocations (for vmas, anon_vmas, or mempolicies) cannot be allocated. Encountering /proc/sys/vm/max_map_count is not a temporary failure, however, so return ENOMEM to indicate this is a more serious issue. A followup patch to the man page will specify this behavior. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701241431120.42507@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Jonathan Corbet Cc: Johannes Weiner Cc: Jerome Marchand Cc: "Kirill A. Shutemov" Cc: Michael Kerrisk Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c6fcba1d1ae5..ed233b002e33 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2041,8 +2041,10 @@ extern struct vm_area_struct *vma_merge(struct mm_struct *, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int split_vma(struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); +extern int __split_vma(struct mm_struct *, struct vm_area_struct *, + unsigned long addr, int new_below); +extern int split_vma(struct mm_struct *, struct vm_area_struct *, + unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); -- cgit v1.2.3 From 288bc54949fc2625a4fd811a188fb200cc498946 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 24 Feb 2017 14:59:16 -0800 Subject: mm/autonuma: let architecture override how the write bit should be stashed in a protnone pte. Patch series "Numabalancing preserve write fix", v2. This patch series address an issue w.r.t THP migration and autonuma preserve write feature. migrate_misplaced_transhuge_page() cannot deal with concurrent modification of the page. It does a page copy without following the migration pte sequence. IIUC, this was done to keep the migration simpler and at the time of implemenation we didn't had THP page cache which would have required a more elaborate migration scheme. That means thp autonuma migration expect the protnone with saved write to be done such that both kernel and user cannot update the page content. This patch series enables archs like ppc64 to do that. We are good with the hash translation mode with the current code, because we never create a hardware page table entry for a protnone pte. This patch (of 2): Autonuma preserves the write permission across numa fault to avoid taking a writefault after a numa fault (Commit: b191f9b106ea " mm: numa: preserve PTE write permissions across a NUMA hinting fault"). Architecture can implement protnone in different ways and some may choose to implement that by clearing Read/ Write/Exec bit of pte. Setting the write bit on such pte can result in wrong behaviour. Fix this up by allowing arch to override how to save the write bit on a protnone pte. [aneesh.kumar@linux.vnet.ibm.com: don't mark pte saved write in case of dirty_accountable] Link: http://lkml.kernel.org/r/1487942884-16517-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com [aneesh.kumar@linux.vnet.ibm.com: v3] Link: http://lkml.kernel.org/r/1487498625-10891-2-git-send-email-aneesh.kumar@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1487050314-3892-2-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michael Neuling Cc: Rik van Riel Cc: Mel Gorman Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a0aba0f9c57b..6265411eb2ed 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -225,6 +225,22 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef pte_savedwrite +#define pte_savedwrite pte_write +#endif + +#ifndef pte_mk_savedwrite +#define pte_mk_savedwrite pte_mkwrite +#endif + +#ifndef pmd_savedwrite +#define pmd_savedwrite pmd_write +#endif + +#ifndef pmd_mk_savedwrite +#define pmd_mk_savedwrite pmd_mkwrite +#endif + #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, -- cgit v1.2.3 From 595cd8f256d24face93b2722927ec9c980419c26 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 24 Feb 2017 14:59:19 -0800 Subject: mm/ksm: handle protnone saved writes when making page write protect Without this KSM will consider the page write protected, but a numa fault can later mark the page writable. This can result in memory corruption. Link: http://lkml.kernel.org/r/1487498625-10891-3-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 6265411eb2ed..f4ca23b158b3 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -233,6 +233,10 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres #define pte_mk_savedwrite pte_mkwrite #endif +#ifndef pte_clear_savedwrite +#define pte_clear_savedwrite pte_wrprotect +#endif + #ifndef pmd_savedwrite #define pmd_savedwrite pmd_write #endif @@ -241,6 +245,10 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres #define pmd_mk_savedwrite pmd_mkwrite #endif +#ifndef pmd_clear_savedwrite +#define pmd_clear_savedwrite pmd_wrprotect +#endif + #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, -- cgit v1.2.3 From 3a4f8a0b3ffa733ffbb327685e83b63383127cf6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 24 Feb 2017 14:59:36 -0800 Subject: mm: remove shmem_mapping() shmem_zero_setup() duplicates Remove the prototypes for shmem_mapping() and shmem_zero_setup() from linux/mm.h, since they are already provided in linux/shmem_fs.h. But shmem_fs.h must then provide the inline stub for shmem_mapping() when CONFIG_SHMEM is not set, and a few more cfiles now need to #include it. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1702081658250.1549@eggly.anvils Signed-off-by: Hugh Dickins Cc: Johannes Weiner Cc: Michal Simek Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 ---------- include/linux/shmem_fs.h | 7 +++++++ 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index ed233b002e33..0d65dd72c0f4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1168,16 +1168,6 @@ extern void pagefault_out_of_memory(void); extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); -int shmem_zero_setup(struct vm_area_struct *); -#ifdef CONFIG_SHMEM -bool shmem_mapping(struct address_space *mapping); -#else -static inline bool shmem_mapping(struct address_space *mapping) -{ - return false; -} -#endif - extern bool can_do_mlock(void); extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index fdaac9d4d46d..a7d6bd2a918f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -57,7 +57,14 @@ extern int shmem_zero_setup(struct vm_area_struct *); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); +#ifdef CONFIG_SHMEM extern bool shmem_mapping(struct address_space *mapping); +#else +static inline bool shmem_mapping(struct address_space *mapping) +{ + return false; +} +#endif /* CONFIG_SHMEM */ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); -- cgit v1.2.3 From dc18d706a4367454ad1fc51e06148d54e8ecfaa0 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Fri, 24 Feb 2017 15:00:02 -0800 Subject: memory-hotplug: use dev_online for memhp_auto_online Commit 31bc3858ea3e ("add automatic onlining policy for the newly added memory") provides the capability to have added memory automatically onlined during add, but this appears to be slightly broken. The current implementation uses walk_memory_range() to call online_memory_block, which uses memory_block_change_state() to online the memory. Instead, we should be calling device_online() for the memory block in online_memory_block(). This would online the memory (the memory bus online routine memory_subsys_online() called from device_online calls memory_block_change_state()) and properly update the device struct offline flag. As a result of the current implementation, attempting to remove a memory block after adding it using auto online fails. This is because doing a remove, for instance echo offline > /sys/devices/system/memory/memoryXXX/state uses device_offline() which checks the dev->offline flag. Link: http://lkml.kernel.org/r/20170222220744.8119.19687.stgit@ltcalpine2-lp14.aus.stglabs.ibm.com Signed-off-by: Nathan Fontenot Cc: Michael Ellerman Cc: Michael Roth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 093607f90b91..b723a686fc10 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -109,9 +109,6 @@ extern void unregister_memory_notifier(struct notifier_block *nb); extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); extern int register_new_memory(int, struct mem_section *); -extern int memory_block_change_state(struct memory_block *mem, - unsigned long to_state, - unsigned long from_state_req); #ifdef CONFIG_MEMORY_HOTREMOVE extern int unregister_memory_section(struct mem_section *); #endif -- cgit v1.2.3 From f9fa1d919c696e90c887d8742198023e7639d139 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 24 Feb 2017 15:00:05 -0800 Subject: kasan: drain quarantine of memcg slab objects Per memcg slab accounting and kasan have a problem with kmem_cache destruction. - kmem_cache_create() allocates a kmem_cache, which is used for allocations from processes running in root (top) memcg. - Processes running in non root memcg and allocating with either __GFP_ACCOUNT or from a SLAB_ACCOUNT cache use a per memcg kmem_cache. - Kasan catches use-after-free by having kfree() and kmem_cache_free() defer freeing of objects. Objects are placed in a quarantine. - kmem_cache_destroy() destroys root and non root kmem_caches. It takes care to drain the quarantine of objects from the root memcg's kmem_cache, but ignores objects associated with non root memcg. This causes leaks because quarantined per memcg objects refer to per memcg kmem cache being destroyed. To see the problem: 1) create a slab cache with kmem_cache_create(,,,SLAB_ACCOUNT,) 2) from non root memcg, allocate and free a few objects from cache 3) dispose of the cache with kmem_cache_destroy() kmem_cache_destroy() will trigger a "Slab cache still has objects" warning indicating that the per memcg kmem_cache structure was leaked. Fix the leak by draining kasan quarantined objects allocated from non root memcg. Racing memcg deletion is tricky, but handled. kmem_cache_destroy() => shutdown_memcg_caches() => __shutdown_memcg_cache() => shutdown_cache() flushes per memcg quarantined objects, even if that memcg has been rmdir'd and gone through memcg_deactivate_kmem_caches(). This leak only affects destroyed SLAB_ACCOUNT kmem caches when kasan is enabled. So I don't think it's worth patching stable kernels. Link: http://lkml.kernel.org/r/1482257462-36948-1-git-send-email-gthelen@google.com Signed-off-by: Greg Thelen Reviewed-by: Vladimir Davydov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 820c0ad54a01..c908b25bf5a5 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -52,7 +52,7 @@ void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags); void kasan_cache_shrink(struct kmem_cache *cache); -void kasan_cache_destroy(struct kmem_cache *cache); +void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_poison_slab(struct page *page); void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); @@ -98,7 +98,7 @@ static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} -static inline void kasan_cache_destroy(struct kmem_cache *cache) {} +static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, -- cgit v1.2.3 From 796f571b0c5cf3efd2f652779770fa7bbbc2bb03 Mon Sep 17 00:00:00 2001 From: Lafcadio Wluiki Date: Fri, 24 Feb 2017 15:00:23 -0800 Subject: procfs: use an enum for possible hidepid values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the hidepid parameter was checked by comparing literal integers 0, 1, 2. Let's add a proper enum for this, to make the checking more expressive: 0 → HIDEPID_OFF 1 → HIDEPID_NO_ACCESS 2 → HIDEPID_INVISIBLE This changes the internal labelling only, the userspace-facing interface remains unmodified, and still works with literal integers 0, 1, 2. No functional changes. Link: http://lkml.kernel.org/r/1484572984-13388-2-git-send-email-djalal@gmail.com Signed-off-by: Lafcadio Wluiki Signed-off-by: Djalal Harouni Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid_namespace.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 34cce96741bc..c2a989dee876 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -21,6 +21,12 @@ struct pidmap { struct fs_pin; +enum { /* definitions for pid_namespace's hide_pid field */ + HIDEPID_OFF = 0, + HIDEPID_NO_ACCESS = 1, + HIDEPID_INVISIBLE = 2, +}; + struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; -- cgit v1.2.3 From 1ca5eebb894a3625b2a543c7b550aa4ae33ba3cc Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 24 Feb 2017 15:00:26 -0800 Subject: uapi: mqueue.h: add missing linux/types.h include Commit 63159f5dcccb ("uapi: Use __kernel_long_t in struct mq_attr") changed the types from long to __kernel_long_t, but didn't add a linux/types.h include. Code that tries to include this header directly breaks: /usr/include/linux/mqueue.h:26:2: error: unknown type name '__kernel_long_t' __kernel_long_t mq_flags; /* message queue flags */ This also upsets configure tests for this header: checking linux/mqueue.h usability... no checking linux/mqueue.h presence... yes configure: WARNING: linux/mqueue.h: present but cannot be compiled configure: WARNING: linux/mqueue.h: check for missing prerequisite headers? configure: WARNING: linux/mqueue.h: see the Autoconf documentation configure: WARNING: linux/mqueue.h: section "Present But Cannot Be Compiled" configure: WARNING: linux/mqueue.h: proceeding with the compiler's result checking for linux/mqueue.h... no Link: http://lkml.kernel.org/r/20170119194644.4403-1-vapier@gentoo.org Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mqueue.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/mqueue.h b/include/uapi/linux/mqueue.h index d0a2b8e89813..bbd5116ea739 100644 --- a/include/uapi/linux/mqueue.h +++ b/include/uapi/linux/mqueue.h @@ -18,6 +18,8 @@ #ifndef _LINUX_MQUEUE_H #define _LINUX_MQUEUE_H +#include + #define MQ_PRIO_MAX 32768 /* per-uid limit of kernel memory used by mqueue, in bytes */ #define MQ_BYTES_MAX 819200 -- cgit v1.2.3 From 16f4e3195156f2f06e90d00a36bc48d7a513f253 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 24 Feb 2017 15:00:29 -0800 Subject: include/linux/iopoll.h: include instead of The timer APIs this header needs are ktime_get(), ktime_add_us(), and ktime_compare(). So, including seems enough. This commit will cut unnecessary header file parsing. Link: http://lkml.kernel.org/r/1481679225-10885-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/iopoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index 1c30014ed176..d29e1e21bf3f 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include -- cgit v1.2.3 From a3f0825e7e37d99a02a8a1b1599687667ee50d04 Mon Sep 17 00:00:00 2001 From: Gideon Israel Dsouza Date: Fri, 24 Feb 2017 15:00:32 -0800 Subject: compiler-gcc.h: add a new macro to wrap gcc attribute Add __mode(x) into compiler-gcc.h as part of a cleanup task I've taken up, to replace gcc specific attributes with macros. The next patch is a cleanup of the m68k subsystem and it requires a new macro to wrap __attribute__ ((mode (...))) Link: http://lkml.kernel.org/r/1485540901-1988-2-git-send-email-gidisrael@gmail.com Signed-off-by: Gideon Israel Dsouza Cc: Greg Ungerer Cc: Geert Uytterhoeven Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index fddd1a5eb322..811f7a915658 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -122,6 +122,7 @@ #define __attribute_const__ __attribute__((__const__)) #define __maybe_unused __attribute__((unused)) #define __always_unused __attribute__((unused)) +#define __mode(x) __attribute__((mode(x))) /* gcc version specific checks */ -- cgit v1.2.3 From 85caa95b9f19bb3a26d7e025d1134760b69e0c40 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 24 Feb 2017 15:00:38 -0800 Subject: bug: switch data corruption check to __must_check The CHECK_DATA_CORRUPTION() macro was designed to have callers do something meaningful/protective on failure. However, using "return false" in the macro too strictly limits the design patterns of callers. Instead, let callers handle the logic test directly, but make sure that the result IS checked by forcing __must_check (which appears to not be able to be used directly on macro expressions). Link: http://lkml.kernel.org/r/20170206204547.GA125312@beast Signed-off-by: Kees Cook Suggested-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index baff2e8fc8a8..5828489309bb 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -124,18 +124,20 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, /* * Since detected data corruption should stop operation on the affected - * structures, this returns false if the corruption condition is found. + * structures. Return value must be checked and sanely acted on by caller. */ +static inline __must_check bool check_data_corruption(bool v) { return v; } #define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ - do { \ - if (unlikely(condition)) { \ + check_data_corruption(({ \ + bool corruption = unlikely(condition); \ + if (corruption) { \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ pr_err(fmt, ##__VA_ARGS__); \ BUG(); \ } else \ WARN(1, fmt, ##__VA_ARGS__); \ - return false; \ } \ - } while (0) + corruption; \ + })) #endif /* _LINUX_BUG_H */ -- cgit v1.2.3 From 4f5901f5a6724f4ed1641e4a94978c5039a1f8c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Fri, 24 Feb 2017 15:01:01 -0800 Subject: linux/kernel.h: fix DIV_ROUND_CLOSEST to support negative divisors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While working on a thermal driver I encounter a scenario where the divisor could be negative, instead of adding local code to handle this I though I first try to add support for this in DIV_ROUND_CLOSEST. Add support to DIV_ROUND_CLOSEST for negative divisors if both dividend and divisor variable types are signed. This should not alter current behavior for users of the macro as previously negative divisors where not supported. Before: DIV_ROUND_CLOSEST( 59, 4) = 15 DIV_ROUND_CLOSEST( 59, -4) = -14 DIV_ROUND_CLOSEST( -59, 4) = -15 DIV_ROUND_CLOSEST( -59, -4) = 14 After: DIV_ROUND_CLOSEST( 59, 4) = 15 DIV_ROUND_CLOSEST( 59, -4) = -15 DIV_ROUND_CLOSEST( -59, 4) = -15 DIV_ROUND_CLOSEST( -59, -4) = 15 [akpm@linux-foundation.org: fix comment, per Guenter] Link: http://lkml.kernel.org/r/20161222102217.29011-1-niklas.soderlund+renesas@ragnatech.se Signed-off-by: Niklas Söderlund Reviewed-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index cb09238f6d32..4c26dc3a8295 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -100,16 +100,18 @@ ) /* - * Divide positive or negative dividend by positive divisor and round - * to closest integer. Result is undefined for negative divisors and - * for negative dividends if the divisor variable type is unsigned. + * Divide positive or negative dividend by positive or negative divisor + * and round to closest integer. Result is undefined for negative + * divisors if he dividend variable type is unsigned and for negative + * dividends if the divisor variable type is unsigned. */ #define DIV_ROUND_CLOSEST(x, divisor)( \ { \ typeof(x) __x = x; \ typeof(divisor) __d = divisor; \ (((typeof(x))-1) > 0 || \ - ((typeof(divisor))-1) > 0 || (__x) > 0) ? \ + ((typeof(divisor))-1) > 0 || \ + (((__x) > 0) == ((__d) > 0))) ? \ (((__x) + ((__d) / 2)) / (__d)) : \ (((__x) - ((__d) / 2)) / (__d)); \ } \ -- cgit v1.2.3 From f231aebfc4cae2f6ed27a46a31e2630909513d77 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 24 Feb 2017 15:01:04 -0800 Subject: rbtree: use designated initializers Prepare to mark sensitive kernel structures for randomization by making sure they're using designated initializers. These were identified during allyesconfig builds of x86, arm, and arm64, with most initializer fixes extracted from grsecurity. Link: http://lkml.kernel.org/r/20161217010253.GA140470@beast Signed-off-by: Kees Cook Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Jie Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index d076183e49be..9702b6e183bc 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -90,7 +90,9 @@ rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ old->rbaugmented = rbcompute(old); \ } \ rbstatic const struct rb_augment_callbacks rbname = { \ - rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ + .propagate = rbname ## _propagate, \ + .copy = rbname ## _copy, \ + .rotate = rbname ## _rotate \ }; -- cgit v1.2.3 From 4e1a33b105ddf201f66dcc44490c6086a25eca0b Mon Sep 17 00:00:00 2001 From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Date: Fri, 24 Feb 2017 15:01:12 -0800 Subject: lib: update LZ4 compressor module Patch series "Update LZ4 compressor module", v7. This patchset updates the LZ4 compression module to a version based on LZ4 v1.7.3 allowing to use the fast compression algorithm aka LZ4 fast which provides an "acceleration" parameter as a tradeoff between high compression ratio and high compression speed. We want to use LZ4 fast in order to support compression in lustre and (mostly, based on that) investigate data reduction techniques in behalf of storage systems. Also, it will be useful for other users of LZ4 compression, as with LZ4 fast it is possible to enable applications to use fast and/or high compression depending on the usecase. For instance, ZRAM is offering a LZ4 backend and could benefit from an updated LZ4 in the kernel. LZ4 homepage: http://www.lz4.org/ LZ4 source repository: https://github.com/lz4/lz4 Source version: 1.7.3 Benchmark (taken from [1], Core i5-4300U @1.9GHz): ----------------|--------------|----------------|---------- Compressor | Compression | Decompression | Ratio ----------------|--------------|----------------|---------- memcpy | 4200 MB/s | 4200 MB/s | 1.000 LZ4 fast 50 | 1080 MB/s | 2650 MB/s | 1.375 LZ4 fast 17 | 680 MB/s | 2220 MB/s | 1.607 LZ4 fast 5 | 475 MB/s | 1920 MB/s | 1.886 LZ4 default | 385 MB/s | 1850 MB/s | 2.101 [1] http://fastcompression.blogspot.de/2015/04/sampling-or-faster-lz4.html [PATCH 1/5] lib: Update LZ4 compressor module [PATCH 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version [PATCH 3/5] crypto: Change LZ4 modules to work with new LZ4 module version [PATCH 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version [PATCH 5/5] lib/lz4: Remove back-compat wrappers This patch (of 5): Update the LZ4 kernel module to LZ4 v1.7.3 by Yann Collet. The kernel module is inspired by the previous work by Chanho Min. The updated LZ4 module will not break existing code since the patchset contains appropriate changes. API changes: New method LZ4_compress_fast which differs from the variant available in kernel by the new acceleration parameter, allowing to trade compression ratio for more compression speed and vice versa. LZ4_decompress_fast is the respective decompression method, featuring a very fast decoder (multiple GB/s per core), able to reach RAM speed in multi-core systems. The decompressor allows to decompress data compressed with LZ4 fast as well as the LZ4 HC (high compression) algorithm. Also the useful functions LZ4_decompress_safe_partial and LZ4_compress_destsize were added. The latter reverses the logic by trying to compress as much data as possible from source to dest while the former aims to decompress partial blocks of data. A bunch of streaming functions were also added which allow compressig/decompressing data in multiple steps (so called "streaming mode"). The methods lz4_compress and lz4_decompress_unknownoutputsize are now known as LZ4_compress_default respectivley LZ4_decompress_safe. The old methods will be removed since there's no callers left in the code. [arnd@arndb.de: fix KERNEL_LZ4 support] Link: http://lkml.kernel.org/r/20170208211946.2839649-1-arnd@arndb.de [akpm@linux-foundation.org: simplify] [akpm@linux-foundation.org: fix the simplification] [4sschmid@informatik.uni-hamburg.de: fix performance regressions] Link: http://lkml.kernel.org/r/1486898178-17125-2-git-send-email-4sschmid@informatik.uni-hamburg.de [4sschmid@informatik.uni-hamburg.de: v8] Link: http://lkml.kernel.org/r/1487182598-15351-2-git-send-email-4sschmid@informatik.uni-hamburg.de Link: http://lkml.kernel.org/r/1486321748-19085-2-git-send-email-4sschmid@informatik.uni-hamburg.de Signed-off-by: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Signed-off-by: Arnd Bergmann Cc: Bongkyu Kim Cc: Rui Salvaterra Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: David S. Miller Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lz4.h | 762 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 696 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 6b784c59f321..1b0f8ca0f9c8 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -1,87 +1,717 @@ -#ifndef __LZ4_H__ -#define __LZ4_H__ -/* - * LZ4 Kernel Interface +/* LZ4 Kernel Interface * * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * Copyright (C) 2016, Sven Schmidt <4sschmid@informatik.uni-hamburg.de> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. + * + * This file is based on the original header file + * for LZ4 - Fast LZ compression algorithm. + * + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2016, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 */ -#define LZ4_MEM_COMPRESS (16384) -#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *))) +#ifndef __LZ4_H__ +#define __LZ4_H__ + +#include +#include /* memset, memcpy */ + +/*-************************************************************************ + * CONSTANTS + **************************************************************************/ /* - * lz4_compressbound() - * Provides the maximum size that LZ4 may output in a "worst case" scenario - * (input data not compressible) + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes + * (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) + * Increasing memory usage improves compression ratio + * Reduced memory usage can improve speed, due to cache effect + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ -static inline size_t lz4_compressbound(size_t isize) +#define LZ4_MEMORY_USAGE 14 + +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) (\ + (unsigned int)(isize) > (unsigned int)LZ4_MAX_INPUT_SIZE \ + ? 0 \ + : (isize) + ((isize)/255) + 16) + +#define LZ4_ACCELERATION_DEFAULT 1 +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) + +#define LZ4HC_MIN_CLEVEL 3 +#define LZ4HC_DEFAULT_CLEVEL 9 +#define LZ4HC_MAX_CLEVEL 16 + +#define LZ4HC_DICTIONARY_LOGSIZE 16 +#define LZ4HC_MAXD (1<= LZ4_compressBound(inputSize). + * It also runs faster, so it's a recommended setting. + * If the function cannot compress 'source' into a more limited 'dest' budget, + * compression stops *immediately*, and the function result is zero. + * As a consequence, 'dest' content is not valid. + * + * Return: Number of bytes written into buffer 'dest' + * (necessarily <= maxOutputSize) or 0 if compression fails + */ +int LZ4_compress_default(const char *source, char *dest, int inputSize, + int maxOutputSize, void *wrkmem); + +/** + * LZ4_compress_fast() - As LZ4_compress_default providing an acceleration param + * @source: source address of the original data + * @dest: output buffer address of the compressed data + * @inputSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @maxOutputSize: full or partial size of buffer 'dest' + * which must be already allocated + * @acceleration: acceleration factor + * @wrkmem: address of the working memory. + * This requires 'workmem' of LZ4_MEM_COMPRESS. + * + * Same as LZ4_compress_default(), but allows to select an "acceleration" + * factor. The larger the acceleration value, the faster the algorithm, + * but also the lesser the compression. It's a trade-off. It can be fine tuned, + * with each successive value providing roughly +~3% to speed. + * An acceleration value of "1" is the same as regular LZ4_compress_default() + * Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT, which is 1. + * + * Return: Number of bytes written into buffer 'dest' + * (necessarily <= maxOutputSize) or 0 if compression fails + */ +int LZ4_compress_fast(const char *source, char *dest, int inputSize, + int maxOutputSize, int acceleration, void *wrkmem); + +/** + * LZ4_compress_destSize() - Compress as much data as possible + * from source to dest + * @source: source address of the original data + * @dest: output buffer address of the compressed data + * @sourceSizePtr: will be modified to indicate how many bytes where read + * from 'source' to fill 'dest'. New value is necessarily <= old value. + * @targetDestSize: Size of buffer 'dest' which must be already allocated + * @wrkmem: address of the working memory. + * This requires 'workmem' of LZ4_MEM_COMPRESS. + * + * Reverse the logic, by compressing as much data as possible + * from 'source' buffer into already allocated buffer 'dest' + * of size 'targetDestSize'. + * This function either compresses the entire 'source' content into 'dest' + * if it's large enough, or fill 'dest' buffer completely with as much data as + * possible from 'source'. + * + * Return: Number of bytes written into 'dest' (necessarily <= targetDestSize) + * or 0 if compression fails + */ +int LZ4_compress_destSize(const char *source, char *dest, int *sourceSizePtr, + int targetDestSize, void *wrkmem); /* - * lz4_decompress() - * src : source address of the compressed data - * src_len : is the input size, whcih is returned after decompress done - * dest : output buffer address of the decompressed data - * actual_dest_len: is the size of uncompressed data, supposing it's known - * return : Success if return 0 - * Error if return (< 0) - * note : Destination buffer must be already allocated. - * slightly faster than lz4_decompress_unknownoutputsize() + * lz4_compress() - For backward compatibility, see LZ4_compress_default + * @src: source address of the original data + * @src_len: size of the original data + * @dst: output buffer address of the compressed data. This requires 'dst' + * of size LZ4_COMPRESSBOUND + * @dst_len: is the output size, which is returned after compress done + * @workmem: address of the working memory. + * + * Return: Success if return 0, Error if return < 0 */ -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len); +int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, + size_t *dst_len, void *wrkmem); + +/*-************************************************************************ + * Decompression Functions + **************************************************************************/ + +/** + * LZ4_decompress_fast() - Decompresses data from 'source' into 'dest' + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated with 'originalSize' bytes + * @originalSize: is the original and therefore uncompressed size + * + * Decompresses data from 'source' into 'dest'. + * This function fully respect memory boundaries for properly formed + * compressed data. + * It is a bit faster than LZ4_decompress_safe(). + * However, it does not provide any protection against intentionally + * modified data stream (malicious input). + * Use this function in trusted environment only + * (data to decode comes from a trusted source). + * + * Return: number of bytes read from the source buffer + * or a negative result if decompression fails. + */ +int LZ4_decompress_fast(const char *source, char *dest, int originalSize); + +/** + * LZ4_decompress_safe() - Decompression protected against buffer overflow + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated + * @compressedSize: is the precise full size of the compressed block + * @maxDecompressedSize: is the size of 'dest' buffer + * + * Decompresses data fom 'source' into 'dest'. + * If the source stream is detected malformed, the function will + * stop decoding and return a negative result. + * This function is protected against buffer overflow exploits, + * including malicious data packets. It never writes outside output buffer, + * nor reads outside input buffer. + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_safe(const char *source, char *dest, int compressedSize, + int maxDecompressedSize); + +/** + * LZ4_decompress_safe_partial() - Decompress a block of size 'compressedSize' + * at position 'source' into buffer 'dest' + * @source: source address of the compressed data + * @dest: output buffer address of the decompressed data which must be + * already allocated + * @compressedSize: is the precise full size of the compressed block. + * @targetOutputSize: the decompression operation will try + * to stop as soon as 'targetOutputSize' has been reached + * @maxDecompressedSize: is the size of destination buffer + * + * This function decompresses a compressed block of size 'compressedSize' + * at position 'source' into destination buffer 'dest' + * of size 'maxDecompressedSize'. + * The function tries to stop decompressing operation as soon as + * 'targetOutputSize' has been reached, reducing decompression time. + * This function never writes outside of output buffer, + * and never reads outside of input buffer. + * It is therefore protected against malicious data packets. + * + * Return: the number of bytes decoded in the destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + * + */ +int LZ4_decompress_safe_partial(const char *source, char *dest, + int compressedSize, int targetOutputSize, int maxDecompressedSize); /* - * lz4_decompress_unknownoutputsize() - * src : source address of the compressed data - * src_len : is the input size, therefore the compressed size - * dest : output buffer address of the decompressed data - * dest_len: is the max size of the destination buffer, which is - * returned with actual size of decompressed data after - * decompress done - * return : Success if return 0 - * Error if return (< 0) - * note : Destination buffer must be already allocated. + * lz4_decompress_unknownoutputsize() - For backwards compatibility, + * see LZ4_decompress_safe + * @src: source address of the compressed data + * @src_len: is the input size, therefore the compressed size + * @dest: output buffer address of the decompressed data + * which must be already allocated + * @dest_len: is the max size of the destination buffer, which is + * returned with actual size of decompressed data after decompress done + * + * Return: Success if return 0, Error if return (< 0) */ int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len); + unsigned char *dest, size_t *dest_len); + +/** + * lz4_decompress() - For backwards cocmpatibility, see LZ4_decompress_fast + * @src: source address of the compressed data + * @src_len: is the input size, which is returned after decompress done + * @dest: output buffer address of the decompressed data, + * which must be already allocated + * @actual_dest_len: is the size of uncompressed data, supposing it's known + * + * Return: Success if return 0, Error if return (< 0) + */ +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); + +/*-************************************************************************ + * LZ4 HC Compression + **************************************************************************/ + +/** + * LZ4_compress_HC() - Compress data from `src` into `dst`, using HC algorithm + * @src: source address of the original data + * @dst: output buffer address of the compressed data + * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @dstCapacity: full or partial size of buffer 'dst', + * which must be already allocated + * @compressionLevel: Recommended values are between 4 and 9, although any + * value between 1 and LZ4HC_MAX_CLEVEL will work. + * Values >LZ4HC_MAX_CLEVEL behave the same as 16. + * @wrkmem: address of the working memory. + * This requires 'wrkmem' of size LZ4HC_MEM_COMPRESS. + * + * Compress data from 'src' into 'dst', using the more powerful + * but slower "HC" algorithm. Compression is guaranteed to succeed if + * `dstCapacity >= LZ4_compressBound(srcSize) + * + * Return : the number of bytes written into 'dst' or 0 if compression fails. + */ +int LZ4_compress_HC(const char *src, char *dst, int srcSize, int dstCapacity, + int compressionLevel, void *wrkmem); + +/** + * lz4hc_compress() - For backwards compatibility, see LZ4_compress_HC + * @src: source address of the original data + * @src_len: size of the original data + * @dst: output buffer address of the compressed data. This requires 'dst' + * of size LZ4_COMPRESSBOUND. + * @dst_len: is the output size, which is returned after compress done + * @wrkmem: address of the working memory. + * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. + * + * Return : Success if return 0, Error if return (< 0) + */ +int lz4hc_compress(const unsigned char *src, size_t src_len, unsigned char *dst, + size_t *dst_len, void *wrkmem); + +/** + * LZ4_resetStreamHC() - Init an allocated 'LZ4_streamHC_t' structure + * @streamHCPtr: pointer to the 'LZ4_streamHC_t' structure + * @compressionLevel: Recommended values are between 4 and 9, although any + * value between 1 and LZ4HC_MAX_CLEVEL will work. + * Values >LZ4HC_MAX_CLEVEL behave the same as 16. + * + * An LZ4_streamHC_t structure can be allocated once + * and re-used multiple times. + * Use this function to init an allocated `LZ4_streamHC_t` structure + * and start a new compression. + */ +void LZ4_resetStreamHC(LZ4_streamHC_t *streamHCPtr, int compressionLevel); + +/** + * LZ4_loadDictHC() - Load a static dictionary into LZ4_streamHC + * @streamHCPtr: pointer to the LZ4HC_stream_t + * @dictionary: dictionary to load + * @dictSize: size of dictionary + * + * Use this function to load a static dictionary into LZ4HC_stream. + * Any previous data will be forgotten, only 'dictionary' + * will remain in memory. + * Loading a size of 0 is allowed. + * + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +int LZ4_loadDictHC(LZ4_streamHC_t *streamHCPtr, const char *dictionary, + int dictSize); + +/** + * LZ4_compress_HC_continue() - Compress 'src' using data from previously + * compressed blocks as a dictionary using the HC algorithm + * @streamHCPtr: Pointer to the previous 'LZ4_streamHC_t' structure + * @src: source address of the original data + * @dst: output buffer address of the compressed data, + * which must be already allocated + * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @maxDstSize: full or partial size of buffer 'dest' + * which must be already allocated + * + * These functions compress data in successive blocks of any size, using + * previous blocks as dictionary. One key assumption is that previous + * blocks (up to 64 KB) remain read-accessible while + * compressing next blocks. There is an exception for ring buffers, + * which can be smaller than 64 KB. + * Ring buffers scenario is automatically detected and handled by + * LZ4_compress_HC_continue(). + * Before starting compression, state must be properly initialized, + * using LZ4_resetStreamHC(). + * A first "fictional block" can then be designated as + * initial dictionary, using LZ4_loadDictHC() (Optional). + * Then, use LZ4_compress_HC_continue() + * to compress each successive block. Previous memory blocks + * (including initial dictionary when present) must remain accessible + * and unmodified during compression. + * 'dst' buffer should be sized to handle worst case scenarios, using + * LZ4_compressBound(), to ensure operation success. + * If, for any reason, previous data blocks can't be preserved unmodified + * in memory during next compression block, + * you must save it to a safer memory space, using LZ4_saveDictHC(). + * Return value of LZ4_saveDictHC() is the size of dictionary + * effectively saved into 'safeBuffer'. + * + * Return: Number of bytes written into buffer 'dst' or 0 if compression fails + */ +int LZ4_compress_HC_continue(LZ4_streamHC_t *streamHCPtr, const char *src, + char *dst, int srcSize, int maxDstSize); + +/** + * LZ4_saveDictHC() - Save static dictionary from LZ4HC_stream + * @streamHCPtr: pointer to the 'LZ4HC_stream_t' structure + * @safeBuffer: buffer to save dictionary to, must be already allocated + * @maxDictSize: size of 'safeBuffer' + * + * If previously compressed data block is not guaranteed + * to remain available at its memory location, + * save it into a safer place (char *safeBuffer). + * Note : you don't need to call LZ4_loadDictHC() afterwards, + * dictionary is immediately usable, you can therefore call + * LZ4_compress_HC_continue(). + * + * Return : saved dictionary size in bytes (necessarily <= maxDictSize), + * or 0 if error. + */ +int LZ4_saveDictHC(LZ4_streamHC_t *streamHCPtr, char *safeBuffer, + int maxDictSize); + +/*-********************************************* + * Streaming Compression Functions + ***********************************************/ + +/** + * LZ4_resetStream() - Init an allocated 'LZ4_stream_t' structure + * @LZ4_stream: pointer to the 'LZ4_stream_t' structure + * + * An LZ4_stream_t structure can be allocated once + * and re-used multiple times. + * Use this function to init an allocated `LZ4_stream_t` structure + * and start a new compression. + */ +void LZ4_resetStream(LZ4_stream_t *LZ4_stream); + +/** + * LZ4_loadDict() - Load a static dictionary into LZ4_stream + * @streamPtr: pointer to the LZ4_stream_t + * @dictionary: dictionary to load + * @dictSize: size of dictionary + * + * Use this function to load a static dictionary into LZ4_stream. + * Any previous data will be forgotten, only 'dictionary' + * will remain in memory. + * Loading a size of 0 is allowed. + * + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +int LZ4_loadDict(LZ4_stream_t *streamPtr, const char *dictionary, + int dictSize); + +/** + * LZ4_saveDict() - Save static dictionary from LZ4_stream + * @streamPtr: pointer to the 'LZ4_stream_t' structure + * @safeBuffer: buffer to save dictionary to, must be already allocated + * @dictSize: size of 'safeBuffer' + * + * If previously compressed data block is not guaranteed + * to remain available at its memory location, + * save it into a safer place (char *safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call + * LZ4_compress_fast_continue(). + * + * Return : saved dictionary size in bytes (necessarily <= dictSize), + * or 0 if error. + */ +int LZ4_saveDict(LZ4_stream_t *streamPtr, char *safeBuffer, int dictSize); + +/** + * LZ4_compress_fast_continue() - Compress 'src' using data from previously + * compressed blocks as a dictionary + * @streamPtr: Pointer to the previous 'LZ4_stream_t' structure + * @src: source address of the original data + * @dst: output buffer address of the compressed data, + * which must be already allocated + * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @maxDstSize: full or partial size of buffer 'dest' + * which must be already allocated + * @acceleration: acceleration factor + * + * Compress buffer content 'src', using data from previously compressed blocks + * as dictionary to improve compression ratio. + * Important : Previous data blocks are assumed to still + * be present and unmodified ! + * If maxDstSize >= LZ4_compressBound(srcSize), + * compression is guaranteed to succeed, and runs faster. + * + * Return: Number of bytes written into buffer 'dst' or 0 if compression fails + */ +int LZ4_compress_fast_continue(LZ4_stream_t *streamPtr, const char *src, + char *dst, int srcSize, int maxDstSize, int acceleration); + +/** + * LZ4_setStreamDecode() - Instruct where to find dictionary + * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure + * @dictionary: dictionary to use + * @dictSize: size of dictionary + * + * Use this function to instruct where to find the dictionary. + * Setting a size of 0 is allowed (same effect as reset). + * + * Return: 1 if OK, 0 if error + */ +int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, + const char *dictionary, int dictSize); + +/** + * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode + * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated + * @compressedSize: is the precise full size of the compressed block + * @maxDecompressedSize: is the size of 'dest' buffer + * + * These decoding function allows decompression of multiple blocks + * in "streaming" mode. + * Previously decoded blocks *must* remain available at the memory position + * where they were decoded (up to 64 KB) + * In the case of a ring buffers, decoding buffer must be either : + * - Exactly same size as encoding buffer, with same update rule + * (block boundaries at same positions) In which case, + * the decoding & encoding ring buffer can have any size, + * including very small ones ( < 64 KB). + * - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * maxBlockSize is implementation dependent. + * It's the maximum size you intend to compress into a single block. + * In which case, encoding and decoding buffers do not need + * to be synchronized, and encoding ring buffer can have any size, + * including small ones ( < 64 KB). + * - _At least_ 64 KB + 8 bytes + maxBlockSize. + * In which case, encoding and decoding buffers do not need to be + * synchronized, and encoding ring buffer can have any size, + * including larger than decoding buffer. W + * Whenever these conditions are not possible, save the last 64KB of decoded + * data into a safe buffer, and indicate where it is saved + * using LZ4_setStreamDecode() + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode, + const char *source, char *dest, int compressedSize, + int maxDecompressedSize); + +/** + * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode + * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated with 'originalSize' bytes + * @originalSize: is the original and therefore uncompressed size + * + * These decoding function allows decompression of multiple blocks + * in "streaming" mode. + * Previously decoded blocks *must* remain available at the memory position + * where they were decoded (up to 64 KB) + * In the case of a ring buffers, decoding buffer must be either : + * - Exactly same size as encoding buffer, with same update rule + * (block boundaries at same positions) In which case, + * the decoding & encoding ring buffer can have any size, + * including very small ones ( < 64 KB). + * - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * maxBlockSize is implementation dependent. + * It's the maximum size you intend to compress into a single block. + * In which case, encoding and decoding buffers do not need + * to be synchronized, and encoding ring buffer can have any size, + * including small ones ( < 64 KB). + * - _At least_ 64 KB + 8 bytes + maxBlockSize. + * In which case, encoding and decoding buffers do not need to be + * synchronized, and encoding ring buffer can have any size, + * including larger than decoding buffer. W + * Whenever these conditions are not possible, save the last 64KB of decoded + * data into a safe buffer, and indicate where it is saved + * using LZ4_setStreamDecode() + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode, + const char *source, char *dest, int originalSize); + +/** + * LZ4_decompress_safe_usingDict() - Same as LZ4_setStreamDecode() + * followed by LZ4_decompress_safe_continue() + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated + * @compressedSize: is the precise full size of the compressed block + * @maxDecompressedSize: is the size of 'dest' buffer + * @dictStart: pointer to the start of the dictionary in memory + * @dictSize: size of dictionary + * + * These decoding function works the same as + * a combination of LZ4_setStreamDecode() followed by + * LZ4_decompress_safe_continue() + * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure. + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_safe_usingDict(const char *source, char *dest, + int compressedSize, int maxDecompressedSize, const char *dictStart, + int dictSize); + +/** + * LZ4_decompress_fast_usingDict() - Same as LZ4_setStreamDecode() + * followed by LZ4_decompress_fast_continue() + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated with 'originalSize' bytes + * @originalSize: is the original and therefore uncompressed size + * @dictStart: pointer to the start of the dictionary in memory + * @dictSize: size of dictionary + * + * These decoding function works the same as + * a combination of LZ4_setStreamDecode() followed by + * LZ4_decompress_safe_continue() + * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure. + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_fast_usingDict(const char *source, char *dest, + int originalSize, const char *dictStart, int dictSize); + #endif -- cgit v1.2.3 From 69c78423b8f439b077929410bdf8f88e7031b891 Mon Sep 17 00:00:00 2001 From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Date: Fri, 24 Feb 2017 15:01:25 -0800 Subject: lib/lz4: remove back-compat wrappers Remove the functions introduced as wrappers for providing backwards compatibility to the prior LZ4 version. They're not needed anymore since there's no callers left. Link: http://lkml.kernel.org/r/1486321748-19085-6-git-send-email-4sschmid@informatik.uni-hamburg.de Signed-off-by: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Cc: Bongkyu Kim Cc: Rui Salvaterra Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: David S. Miller Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lz4.h | 69 ----------------------------------------------------- 1 file changed, 69 deletions(-) (limited to 'include') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 1b0f8ca0f9c8..394e3d9213b8 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -172,18 +172,6 @@ static inline int LZ4_compressBound(size_t isize) return LZ4_COMPRESSBOUND(isize); } -/** - * lz4_compressbound() - For backwards compatibility; see LZ4_compressBound - * @isize: Size of the input data - * - * Return: Max. size LZ4 may output in a "worst case" szenario - * (data not compressible) - */ -static inline int lz4_compressbound(size_t isize) -{ - return LZ4_COMPRESSBOUND(isize); -} - /** * LZ4_compress_default() - Compress data from source to dest * @source: source address of the original data @@ -257,20 +245,6 @@ int LZ4_compress_fast(const char *source, char *dest, int inputSize, int LZ4_compress_destSize(const char *source, char *dest, int *sourceSizePtr, int targetDestSize, void *wrkmem); -/* - * lz4_compress() - For backward compatibility, see LZ4_compress_default - * @src: source address of the original data - * @src_len: size of the original data - * @dst: output buffer address of the compressed data. This requires 'dst' - * of size LZ4_COMPRESSBOUND - * @dst_len: is the output size, which is returned after compress done - * @workmem: address of the working memory. - * - * Return: Success if return 0, Error if return < 0 - */ -int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, - size_t *dst_len, void *wrkmem); - /*-************************************************************************ * Decompression Functions **************************************************************************/ @@ -346,34 +320,6 @@ int LZ4_decompress_safe(const char *source, char *dest, int compressedSize, int LZ4_decompress_safe_partial(const char *source, char *dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); -/* - * lz4_decompress_unknownoutputsize() - For backwards compatibility, - * see LZ4_decompress_safe - * @src: source address of the compressed data - * @src_len: is the input size, therefore the compressed size - * @dest: output buffer address of the decompressed data - * which must be already allocated - * @dest_len: is the max size of the destination buffer, which is - * returned with actual size of decompressed data after decompress done - * - * Return: Success if return 0, Error if return (< 0) - */ -int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len); - -/** - * lz4_decompress() - For backwards cocmpatibility, see LZ4_decompress_fast - * @src: source address of the compressed data - * @src_len: is the input size, which is returned after decompress done - * @dest: output buffer address of the decompressed data, - * which must be already allocated - * @actual_dest_len: is the size of uncompressed data, supposing it's known - * - * Return: Success if return 0, Error if return (< 0) - */ -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len); - /*-************************************************************************ * LZ4 HC Compression **************************************************************************/ @@ -400,21 +346,6 @@ int lz4_decompress(const unsigned char *src, size_t *src_len, int LZ4_compress_HC(const char *src, char *dst, int srcSize, int dstCapacity, int compressionLevel, void *wrkmem); -/** - * lz4hc_compress() - For backwards compatibility, see LZ4_compress_HC - * @src: source address of the original data - * @src_len: size of the original data - * @dst: output buffer address of the compressed data. This requires 'dst' - * of size LZ4_COMPRESSBOUND. - * @dst_len: is the output size, which is returned after compress done - * @wrkmem: address of the working memory. - * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. - * - * Return : Success if return 0, Error if return (< 0) - */ -int lz4hc_compress(const unsigned char *src, size_t src_len, unsigned char *dst, - size_t *dst_len, void *wrkmem); - /** * LZ4_resetStreamHC() - Init an allocated 'LZ4_streamHC_t' structure * @streamHCPtr: pointer to the 'LZ4_streamHC_t' structure -- cgit v1.2.3