From 57417cebc96b57122a2207fc84a6077d20c84b4b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:13 -0700 Subject: XArray: add xa_get_order Patch series "Fix read-only THP for non-tmpfs filesystems". As described more verbosely in the [3/3] changelog, we can inadvertently put an order-0 page in the page cache which occupies 512 consecutive entries. Users are running into this if they enable the READ_ONLY_THP_FOR_FS config option; see https://bugzilla.kernel.org/show_bug.cgi?id=206569 and Qian Cai has also reported it here: https://lore.kernel.org/lkml/20200616013309.GB815@lca.pw/ This is a rather intrusive way of fixing the problem, but has the advantage that I've actually been testing it with the THP patches, which means that it sees far more use than it does upstream -- indeed, Song has been entirely unable to reproduce it. It also has the advantage that it removes a few patches from my gargantuan backlog of THP patches. This patch (of 3): This function returns the order of the entry at the index. We need this because there isn't space in the shadow entry to encode its order. [akpm@linux-foundation.org: export xa_get_order to modules] Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: "Kirill A . 
Shutemov" Cc: Qian Cai Cc: Song Liu Link: https://lkml.kernel.org/r/20200903183029.14930-1-willy@infradead.org Link: https://lkml.kernel.org/r/20200903183029.14930-2-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/xarray.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index b4d70e7568b2..5b7f4ebcf4ff 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1505,6 +1505,15 @@ void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); +#ifdef CONFIG_XARRAY_MULTI +int xa_get_order(struct xarray *, unsigned long index); +#else +static inline int xa_get_order(struct xarray *xa, unsigned long index) +{ + return 0; +} +#endif + /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. -- cgit v1.2.3 From 8fc75643c5e14574c8be59b69182452ece28315a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:16 -0700 Subject: XArray: add xas_split In order to use multi-index entries for huge pages in the page cache, we need to be able to split a multi-index entry (eg if a file is truncated in the middle of a huge page entry). This version does not support splitting more than one level of the tree at a time. This is an acceptable limitation for the page cache as we do not expect to support order-12 pages in the near future. [akpm@linux-foundation.org: export xas_split_alloc() to modules] [willy@infradead.org: fix xarray split] Link: https://lkml.kernel.org/r/20200910175450.GV6583@casper.infradead.org [willy@infradead.org: fix xarray] Link: https://lkml.kernel.org/r/20201001233943.GW20115@casper.infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: "Kirill A . 
Shutemov" Cc: Qian Cai Cc: Song Liu Link: https://lkml.kernel.org/r/20200903183029.14930-3-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/xarray.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 5b7f4ebcf4ff..5cdf441f6377 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1507,11 +1507,24 @@ void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); +void xas_split(struct xa_state *, void *entry, unsigned int order); +void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } + +static inline void xas_split(struct xa_state *xas, void *entry, + unsigned int order) +{ + xas_store(xas, entry); +} + +static inline void xas_split_alloc(struct xa_state *xas, void *entry, + unsigned int order, gfp_t gfp) +{ +} #endif /** -- cgit v1.2.3 From 8fb156c9ee2db94f7127c930c89917634a1a9f56 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:29 -0700 Subject: mm/page_owner: change split_page_owner to take a count The implementation of split_page_owner() prefers a count rather than the old order of the page. When we support a variable size THP, we won't have the order at this point, but we will have the number of pages. So change the interface to what the caller and callee would prefer. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Acked-by: Kirill A. 
Shutemov Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-4-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/page_owner.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 8679ccd722e8..3468794f83d2 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,7 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, unsigned int order); +extern void __split_page_owner(struct page *page, unsigned int nr); extern void __copy_page_owner(struct page *oldpage, struct page *newpage); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(struct page *page); @@ -31,10 +31,10 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, unsigned int order) +static inline void split_page_owner(struct page *page, unsigned int nr) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, order); + __split_page_owner(page, nr); } static inline void copy_page_owner(struct page *oldpage, struct page *newpage) { -- cgit v1.2.3 From 01c70267053d6718820ac0902d8823d5dd2a6adb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:00 -0700 Subject: fs: add a filesystem flag for THPs The page cache needs to know whether the filesystem supports THPs so that it doesn't send THPs to filesystems which can't handle them. Dave Chinner points out that getting from the page mapping to the filesystem type is too many steps (mapping->host->i_sb->s_type->fs_flags) so cache that information in the address space flags. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Alexander Viro Cc: "Matthew Wilcox (Oracle)" Cc: Hugh Dickins Cc: Song Liu Cc: Rik van Riel Cc: "Kirill A . Shutemov" Cc: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20200916032717.22917-1-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + include/linux/pagemap.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ae97d87a00d2..72369be23f91 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2209,6 +2209,7 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ +#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c3afd3242b54..820c970fd24a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -29,6 +29,7 @@ enum mapping_flags { AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, + AS_THP_SUPPORT = 6, /* THPs supported */ }; /** @@ -120,6 +121,11 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +static inline bool mapping_thp_support(struct address_space *mapping) +{ + return test_bit(AS_THP_SUPPORT, &mapping->flags); +} + void release_pages(struct page **pages, int nr); /* -- cgit v1.2.3 From 6f4d2f9770cf154f9867f466d7b1b463a39f05a7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:03 -0700 Subject: fs: do not update nr_thps for mappings which support THPs The nr_thps counter is 
to support THPs in the page cache when the filesystem doesn't understand THPs. Eventually it will be removed, but we should still support filesystems which do not understand THPs yet. Move the nr_thp manipulation functions to filemap.h since they're page-cache specific. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Alexander Viro Cc: "Matthew Wilcox (Oracle)" Cc: Hugh Dickins Cc: Song Liu Cc: Rik van Riel Cc: "Kirill A . Shutemov" Cc: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20200916032717.22917-2-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/fs.h | 27 --------------------------- include/linux/pagemap.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 72369be23f91..d1d166b46131 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2697,33 +2697,6 @@ static inline errseq_t file_sample_sb_err(struct file *file) return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); } -static inline int filemap_nr_thps(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - return atomic_read(&mapping->nr_thps); -#else - return 0; -#endif -} - -static inline void filemap_nr_thps_inc(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - atomic_inc(&mapping->nr_thps); -#else - WARN_ON_ONCE(1); -#endif -} - -static inline void filemap_nr_thps_dec(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - atomic_dec(&mapping->nr_thps); -#else - WARN_ON_ONCE(1); -#endif -} - extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 820c970fd24a..a0024528a9ee 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -126,6 +126,35 @@ static inline bool 
mapping_thp_support(struct address_space *mapping) return test_bit(AS_THP_SUPPORT, &mapping->flags); } +static inline int filemap_nr_thps(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + return atomic_read(&mapping->nr_thps); +#else + return 0; +#endif +} + +static inline void filemap_nr_thps_inc(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + if (!mapping_thp_support(mapping)) + atomic_inc(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + +static inline void filemap_nr_thps_dec(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + if (!mapping_thp_support(mapping)) + atomic_dec(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + void release_pages(struct page **pages, int nr); /* -- cgit v1.2.3 From 1aa83cfa5a20a6bbd39d2355a89c95152e4b37b4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:10 -0700 Subject: mm/readahead: add DEFINE_READAHEAD Patch series "Readahead patches for 5.9/5.10". These are infrastructure for both the THP patchset and for the fscache rewrite. For both pieces of infrastructure being built on top of this patchset, we want the ractl to be available higher in the call-stack. For David's work, he wants to add the 'critical page' to the ractl so that he knows which page NEEDS to be brought in from storage, and which ones are nice-to-have. We might want something similar in block storage too. It used to be simple -- the first page was the critical one, but then mmap added fault-around and so for that usecase, the middle page is the critical one. Anyway, I don't have any code to show that yet, we just know that the lowest point in the callchain where we have that information is do_sync_mmap_readahead() and so the ractl needs to start its life there. For THP, we have the code that needs it. 
It's actually the apex patch to the series; the one which finally starts to allocate THPs and present them to consenting filesystems: http://git.infradead.org/users/willy/pagecache.git/commitdiff/798bcf30ab2eff278caad03a9edca74d2f8ae760 This patch (of 8): Allow for a more concise definition of a struct readahead_control. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Eric Biggers Cc: David Howells Link: https://lkml.kernel.org/r/20200903140844.14194-1-willy@infradead.org Link: https://lkml.kernel.org/r/20200903140844.14194-3-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a0024528a9ee..63c81b512e80 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -812,6 +812,13 @@ struct readahead_control { unsigned int _batch_count; }; +#define DEFINE_READAHEAD(rac, f, m, i) \ + struct readahead_control rac = { \ + .file = f, \ + .mapping = m, \ + ._index = i, \ + } + /** * readahead_page - Get the next page to read. * @rac: The current readahead request. -- cgit v1.2.3 From 73bb49da50cd460bb3ba31250ed2e7fbf2115edf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:14 -0700 Subject: mm/readahead: make page_cache_ra_unbounded take a readahead_control Define it in the callers instead of in page_cache_ra_unbounded(). 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: David Howells Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-4-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 63c81b512e80..37f209ccef0f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -768,9 +768,8 @@ void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, void page_cache_async_readahead(struct address_space *, struct file_ra_state *, struct file *, struct page *, pgoff_t index, unsigned long req_count); -void page_cache_readahead_unbounded(struct address_space *, struct file *, - pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_count); +void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: -- cgit v1.2.3 From fefa7c478fdafe71c64b5ddf817ac0271aed1146 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:28 -0700 Subject: mm/readahead: add page_cache_sync_ra and page_cache_async_ra Reimplement page_cache_sync_readahead() and page_cache_async_readahead() as wrappers around versions of the function which take a readahead_control in preparation for making do_sync_mmap_readahead() pass down an RAC struct. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: David Howells Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-8-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 64 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 37f209ccef0f..c77b7c31b2e4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -761,16 +761,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) - -void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, - struct file *, pgoff_t index, unsigned long req_count); -void page_cache_async_readahead(struct address_space *, struct file_ra_state *, - struct file *, struct page *, pgoff_t index, - unsigned long req_count); -void page_cache_ra_unbounded(struct readahead_control *, - unsigned long nr_to_read, unsigned long lookahead_count); - /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __SetPageLocked() against it. 
@@ -818,6 +808,60 @@ struct readahead_control { ._index = i, \ } +#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) + +void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); +void page_cache_sync_ra(struct readahead_control *, struct file_ra_state *, + unsigned long req_count); +void page_cache_async_ra(struct readahead_control *, struct file_ra_state *, + struct page *, unsigned long req_count); + +/** + * page_cache_sync_readahead - generic file readahead + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @file: Used by the filesystem for authentication. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. + * + * page_cache_sync_readahead() should be called when a cache miss happened: + * it will submit the read. The readahead logic may decide to piggyback more + * pages onto the read request if access patterns suggest it will improve + * performance. + */ +static inline +void page_cache_sync_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, pgoff_t index, + unsigned long req_count) +{ + DEFINE_READAHEAD(ractl, file, mapping, index); + page_cache_sync_ra(&ractl, ra, req_count); +} + +/** + * page_cache_async_readahead - file readahead for marked pages + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @file: Used by the filesystem for authentication. + * @page: The page at @index which triggered the readahead call. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. 
+ * + * page_cache_async_readahead() should be called when a page is used which + * is marked as PageReadahead; this is a marker to suggest that the application + * has used up enough of the readahead window that we should start pulling in + * more pages. + */ +static inline +void page_cache_async_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + struct page *page, pgoff_t index, unsigned long req_count) +{ + DEFINE_READAHEAD(ractl, file, mapping, index); + page_cache_async_ra(&ractl, ra, page, req_count); +} + /** * readahead_page - Get the next page to read. * @rac: The current readahead request. -- cgit v1.2.3 From 7e27f22c9e40b66186e0675376f0495725ff1b0a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:06:50 -0700 Subject: mm,hwpoison: unexport get_hwpoison_page and make it static Since get_hwpoison_page is only used in memory-failure code now, let us un-export it and make it private to that code. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-5-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 620961e4f32b..1977c09afe7a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3025,7 +3025,6 @@ extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern int get_hwpoison_page(struct page *page); #define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int 
sysctl_memory_failure_recovery; -- cgit v1.2.3 From dd6e2402fad966290f35dc687294fb6049714aac Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:06:57 -0700 Subject: mm,hwpoison: kill put_hwpoison_page After commit 4e41a30c6d50 ("mm: hwpoison: adjust for new thp refcounting"), put_hwpoison_page got reduced to a put_page. Let us just use put_page instead. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-7-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1977c09afe7a..ab038a3521b4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3025,7 +3025,6 @@ extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -#define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); -- cgit v1.2.3 From 06be6ff3d2ec8be806b859fc054a1909b16d2473 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:07:05 -0700 Subject: mm,hwpoison: rework soft offline for free pages When trying to soft-offline a free page, we need to first take it off the buddy allocator. Once we know it is out of reach, we can safely flag it as poisoned. take_page_off_buddy will be used to take a page meant to be poisoned off the buddy allocator. take_page_off_buddy calls break_down_buddy_pages, which splits a higher-order page in case our page belongs to one. 
Once the page is under our control, we call page_handle_poison to set it as poisoned and grab a refcount on it. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-9-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 38ded408bd4c..a02b6d0221db 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -432,6 +432,7 @@ PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) extern bool set_hwpoison_free_buddy_page(struct page *page); +extern bool take_page_off_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison) static inline bool set_hwpoison_free_buddy_page(struct page *page) -- cgit v1.2.3 From 79f5f8fab482dfff62948214468ac4ebbf0a016f Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:07:09 -0700 Subject: mm,hwpoison: rework soft offline for in-use pages This patch changes the way we set and handle in-use poisoned pages. Until now, poisoned pages were released to the buddy allocator, trusting that the checks that take place at allocation time would act as a safety net and would skip that page. This has proved to be wrong, as we got some pfn walkers out there, like compaction, where all they care about is that the page is in a buddy freelist. Although this might not be the only user, having poisoned pages in the buddy allocator seems a bad idea as we should only have free pages that are ready and meant to be used as such. Before explaining the taken approach, let us break down the kind of pages we can soft offline. 
- Anonymous THP (after the split, they end up being 4K pages) - Hugetlb - Order-0 pages (that can be either migrated or invalidated) * Normal pages (order-0 and anon-THP) - If they are clean and unmapped page cache pages, we invalidate them by means of invalidate_inode_page(). - If they are mapped/dirty, we do the isolate-and-migrate dance. Either way, do not call put_page directly from those paths. Instead, we keep the page and send it to page_handle_poison to perform the right handling. page_handle_poison sets the HWPoison flag and does the last put_page. Down the chain, we placed a check for HWPoison page in free_pages_prepare, that just skips any poisoned page, so those pages do not end up in any pcplist/freelist. After that, we set the refcount on the page to 1 and we increment the poisoned pages counter. If we see that the check in free_pages_prepare creates trouble, we can always do what we do for free pages: - wait until the page hits buddy's freelists - take it off, and flag it The downside of the above approach is that we could race with an allocation, so by the time we want to take the page off the buddy, the page has been already allocated so we cannot soft offline it. But the user could always retry it. * Hugetlb pages - We isolate-and-migrate them After the migration has been successful, we call dissolve_free_huge_page, and we set HWPoison on the page if we succeed. Hugetlb has a slightly different handling though. While for non-hugetlb pages we cared about closing the race with an allocation, doing so for hugetlb pages requires quite some additional and intrusive code (we would need to hook in free_huge_page and some other places). So I decided to not make the code overly complicated and just fail normally if the page was allocated in the meantime. We can always build on top of this. 
As a bonus, because of the way we handle now in-use pages, we no longer need the put-as-isolation-migratetype dance, that was guarding for poisoned pages to end up in pcplists. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-10-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a02b6d0221db..4f6ba9379112 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -431,14 +431,9 @@ PAGEFLAG_FALSE(Uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) -extern bool set_hwpoison_free_buddy_page(struct page *page); extern bool take_page_off_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison) -static inline bool set_hwpoison_free_buddy_page(struct page *page) -{ - return 0; -} #define __PG_HWPOISON 0 #endif -- cgit v1.2.3 From 5d1fd5dc877bc1c670e7b1c174aa659b76c07de1 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 15 Oct 2020 20:07:21 -0700 Subject: mm,hwpoison: introduce MF_MSG_UNSPLIT_THP memory_failure() is supposed to call action_result() when it handles a memory error event, but there's one missing case. So let's add it. I find that include/ras/ras_event.h has some other MF_MSG_* undefined, so this patch also adds them. 
Signed-off-by: Naoya Horiguchi Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-13-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ab038a3521b4..a9df46309e07 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3064,6 +3064,7 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_BUDDY_2ND, MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, }; -- cgit v1.2.3 From 257bea71582d895894201b604990a900df489103 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:07:59 -0700 Subject: mm/page_alloc: simplify __offline_isolated_pages() offline_pages() is the only user. __offline_isolated_pages() never gets called with ranges that contain memory holes and we no longer care about the return value. Drop the return value handling and all pfn_valid() checks. Update the documentation. 
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Mel Gorman Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Link: https://lkml.kernel.org/r/20200819175957.28465-5-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c0faa7a30c46..76b314031f09 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -103,8 +103,8 @@ extern int online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); -extern unsigned long __offline_isolated_pages(unsigned long start_pfn, - unsigned long end_pfn); +extern void __offline_isolated_pages(unsigned long start_pfn, + unsigned long end_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); -- cgit v1.2.3 From d882c0067d99d0f2add9a41628703cc99511a639 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:19 -0700 Subject: mm: pass migratetype into memmap_init_zone() and move_pfn_range_to_zone() On the memory onlining path, we want to start with MIGRATE_ISOLATE, to un-isolate the pages after memory onlining is complete. Let's allow passing in the migratetype. 
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Tony Luck Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: Dan Williams Cc: Mike Rapoport Cc: "Matthew Wilcox (Oracle)" Cc: Michel Lespinasse Cc: Charan Teja Reddy Cc: Mel Gorman Link: https://lkml.kernel.org/r/20200819175957.28465-10-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 ++- include/linux/mm.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 76b314031f09..51a877fec8da 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,7 +351,8 @@ extern int add_memory_resource(int nid, struct resource *resource); extern int add_memory_driver_managed(int nid, u64 start, u64 size, const char *resource_name); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype); extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index a9df46309e07..61a2633fcc7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2440,7 +2440,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum meminit_context, struct vmem_altmap *); + enum meminit_context, struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); -- cgit v1.2.3 From ec62d04e3fdc4ba3a7912cd7f6da1a4e787a0d75 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 
20:08:28 -0700 Subject: kernel/resource: make release_mem_region_adjustable() never fail Patch series "selective merging of system ram resources", v4. Some add_memory*() users add memory in small, contiguous memory blocks. Examples include virtio-mem, hyper-v balloon, and the XEN balloon. This can quickly result in a lot of memory resources, whereby the actual resource boundaries are not of interest (e.g., it might be relevant for DIMMs, exposed via /proc/iomem to user space). We really want to merge added resources in this scenario where possible. Resources are effectively stored in a list-based tree. Having a lot of resources not only wastes memory, it also makes traversing that tree more expensive, and makes /proc/iomem explode in size (e.g., requiring kexec-tools to manually merge resources when creating a kdump header. The current kexec-tools resource count limit does not allow for more than ~100GB of memory with a memory block size of 128MB on x86-64). Let's allow to selectively merge system ram resources by specifying a new flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only tested with virtio-mem. This patch (of 8): Let's make sure splitting a resource on memory hotunplug will never fail. This will become more relevant once we merge selected System RAM resources - then, we'll trigger that case more often on memory hotunplug. In general, this function is already unlikely to fail. When we remove memory, we free up quite a lot of metadata (memmap, page tables, memory block device, etc.). The only reason it could really fail would be when injecting allocation errors. All other error cases inside release_mem_region_adjustable() seem to be sanity checks if the function would be abused in different context - let's add WARN_ON_ONCE() in these cases so we can catch them. 
[natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable] Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com Link: https://github.com/ClangBuiltLinux/linux/issues/1159 Signed-off-by: David Hildenbrand Signed-off-by: Nathan Chancellor Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Roger Pau Monné Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 6c2b06fe8beb..52a91f5fa1a3 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -248,8 +248,8 @@ extern struct resource * __request_region(struct resource *, extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE -extern int release_mem_region_adjustable(struct resource *, resource_size_t, - resource_size_t); +extern void release_mem_region_adjustable(struct resource *, resource_size_t, + resource_size_t); #endif /* Wrappers for managed devices */ -- cgit v1.2.3 From 7cf603d17d9bddbda90c424b6f30c7bc2e6f48f2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:33
-0700 Subject: kernel/resource: move and rename IORESOURCE_MEM_DRIVER_MANAGED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IORESOURCE_MEM_DRIVER_MANAGED currently uses an unused PnP bit, which is always set to 0 by hardware. This is far from beautiful (and confusing), and the bit only applies to SYSRAM. So let's move it out of the bus-specific (PnP) defined bits. We'll add another SYSRAM specific bit soon. If we ever need more bits for other purposes, we can steal some from "desc", or reshuffle/regroup what we have. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Eric Biederman Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Roger Pau Monné Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-3-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 52a91f5fa1a3..d7620d7c941a 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -58,6 +58,9 @@ struct resource { #define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */ #define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */ +/* IORESOURCE_SYSRAM specific bits. 
*/ +#define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. */ + #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ #define IORESOURCE_DISABLED 0x10000000 @@ -103,7 +106,6 @@ struct resource { #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) -#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) -- cgit v1.2.3 From 3a0aaefe4134951b4e89feb873c457428154530c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:39 -0700 Subject: mm/memory_hotplug: guard more declarations by CONFIG_MEMORY_HOTPLUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We soon want to pass flags via a new type to add_memory() and friends. That revealed that we currently don't guard some declarations by CONFIG_MEMORY_HOTPLUG. While some definitions could be moved to different places, let's keep it minimal for now and use CONFIG_MEMORY_HOTPLUG for all functions only compiled with CONFIG_MEMORY_HOTPLUG. Wrap sparse_decode_mem_map() into CONFIG_MEMORY_HOTPLUG, it's only called from CONFIG_MEMORY_HOTPLUG code. While at it, remove allow_online_pfn_range(), which is no longer around, and mhp_notimplemented(), which is unused. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Gunthorpe Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: Kees Cook Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. 
Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Roger Pau Monné Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-4-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 51a877fec8da..1504b4d5ae6c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -247,13 +247,6 @@ static inline void zone_span_writelock(struct zone *zone) {} static inline void zone_span_writeunlock(struct zone *zone) {} static inline void zone_seqlock_init(struct zone *zone) {} -static inline int mhp_notimplemented(const char *func) -{ - printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); - dump_stack(); - return -ENOSYS; -} - static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } @@ -344,6 +337,7 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {} extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); +#ifdef CONFIG_MEMORY_HOTPLUG extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); @@ -364,8 +358,8 @@ extern void sparse_remove_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, - int online_type); extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages); +#endif /* CONFIG_MEMORY_HOTPLUG */ + 
#endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit v1.2.3 From b6117199787c60539105d2de0d010146e8396fc3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:44 -0700 Subject: mm/memory_hotplug: prepare passing flags to add_memory() and friends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We soon want to pass flags, e.g., to mark added System RAM resources mergeable. Prepare for that. This patch is based on a similar patch by Oscar Salvador: https://lkml.kernel.org/r/20190625075227.15193-3-osalvador@suse.de Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Juergen Gross # Xen related part Reviewed-by: Pankaj Gupta Acked-by: Wei Liu Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Baoquan He Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Greg Kroah-Hartman Cc: Vishal Verma Cc: Dave Jiang Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: David Hildenbrand Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Boris Ostrovsky Cc: Stefano Stabellini Cc: "Oliver O'Halloran" Cc: Pingfan Liu Cc: Nathan Lynch Cc: Libor Pechacek Cc: Anton Blanchard Cc: Leonardo Bras Cc: Ard Biesheuvel Cc: Eric Biederman Cc: Julien Grall Cc: Kees Cook Cc: Roger Pau Monné Cc: Thomas Gleixner Cc: Wei Yang Link: https://lkml.kernel.org/r/20200911103459.10306-5-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 1504b4d5ae6c..33eb80fdba22 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -57,6 +57,12 @@ enum { MMOP_ONLINE_MOVABLE, }; +/* Flags for add_memory() and friends to specify memory hotplug details.
*/ +typedef int __bitwise mhp_t; + +/* No special request */ +#define MHP_NONE ((__force mhp_t)0) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -339,11 +345,13 @@ extern void clear_zone_contiguous(struct zone *zone); #ifdef CONFIG_MEMORY_HOTPLUG extern void __ref free_area_init_core_hotplug(int nid); -extern int __add_memory(int nid, u64 start, u64 size); -extern int add_memory(int nid, u64 start, u64 size); -extern int add_memory_resource(int nid, struct resource *resource); +extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); +extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); +extern int add_memory_resource(int nid, struct resource *resource, + mhp_t mhp_flags); extern int add_memory_driver_managed(int nid, u64 start, u64 size, - const char *resource_name); + const char *resource_name, + mhp_t mhp_flags); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap, int migratetype); -- cgit v1.2.3 From 9ca6551ee24368a4d2b09566ea4d10fe87860379 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:49 -0700 Subject: mm/memory_hotplug: MEMHP_MERGE_RESOURCE to specify merging of System RAM resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some add_memory*() users add memory in small, contiguous memory blocks. Examples include virtio-mem, hyper-v balloon, and the XEN balloon. This can quickly result in a lot of memory resources, whereby the actual resource boundaries are not of interest (e.g., it might be relevant for DIMMs, exposed via /proc/iomem to user space). We really want to merge added resources in this scenario where possible. 
Let's provide a flag (MEMHP_MERGE_RESOURCE) to specify that a resource either created within add_memory*() or passed via add_memory_resource() shall be marked mergeable and merged with applicable siblings. To implement that, we need a kernel/resource interface to mark selected System RAM resources mergeable (IORESOURCE_SYSRAM_MERGEABLE) and trigger merging. Note: We really want to merge after the whole operation succeeded, not directly when adding a resource to the resource tree (it would break add_memory_resource() and require splitting resources again when the operation failed - e.g., due to -ENOMEM). Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Pankaj Gupta Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Thomas Gleixner Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Roger Pau Monné Cc: Julien Grall Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Jason Wang Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Vasily Gorbik Cc: Vishal Verma Link: https://lkml.kernel.org/r/20200911103459.10306-6-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++++ include/linux/memory_hotplug.h | 7 +++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index d7620d7c941a..7e61389dcb01 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -60,6 +60,7 @@ struct resource { /* IORESOURCE_SYSRAM specific bits. */ #define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. 
*/ +#define IORESOURCE_SYSRAM_MERGEABLE 0x04000000 /* Resource can be merged. */ #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ @@ -253,6 +254,9 @@ extern void __release_region(struct resource *, resource_size_t, extern void release_mem_region_adjustable(struct resource *, resource_size_t, resource_size_t); #endif +#ifdef CONFIG_MEMORY_HOTPLUG +extern void merge_system_ram_resource(struct resource *res); +#endif /* Wrappers for managed devices */ struct device; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 33eb80fdba22..d65c6fdc5cfc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -62,6 +62,13 @@ typedef int __bitwise mhp_t; /* No special request */ #define MHP_NONE ((__force mhp_t)0) +/* + * Allow merging of the added System RAM resource with adjacent, + * mergeable resources. After a successful call to add_memory_resource() + * with this flag set, the resource pointer must no longer be used as it + * might be stale, or the resource might have changed. + */ +#define MEMHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) /* * Extended parameters for memory hotplug: -- cgit v1.2.3 From cb8e3c8b4f45e4ed8987a581956dc9c3827a5bcf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:12 -0700 Subject: kernel/resource: make iomem_resource implicit in release_mem_region_adjustable() "mem" in the name already indicates the root, similar to release_mem_region() and devm_request_mem_region(). Make it implicit. The only single caller always passes iomem_resource, other parents are not applicable. 
Suggested-by: Wei Yang Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Link: https://lkml.kernel.org/r/20200916073041.10355-1-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 7e61389dcb01..5135d4b86cd6 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -251,8 +251,7 @@ extern struct resource * __request_region(struct resource *, extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE -extern void release_mem_region_adjustable(struct resource *, resource_size_t, - resource_size_t); +extern void release_mem_region_adjustable(resource_size_t, resource_size_t); #endif #ifdef CONFIG_MEMORY_HOTPLUG extern void merge_system_ram_resource(struct resource *res); -- cgit v1.2.3 From 90c7eaeb14a325a760d732184ff1fbed47e5fa98 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 15 Oct 2020 20:09:15 -0700 Subject: mm: don't panic when links can't be created in sysfs At boot time, or when doing memory hot-add operations, if the links in sysfs can't be created, the system is still able to run, so just report the error in the kernel log rather than BUG_ON and potentially make system unusable because the callpath can be called with locks held. Since the number of memory blocks managed could be high, the messages are rate limited. As a consequence, link_mem_sections() has no status to report anymore. Signed-off-by: Laurent Dufour Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Fenghua Yu Cc: Nathan Lynch Cc: "Rafael J . 
Wysocki" Cc: Scott Cheloha Cc: Tony Luck Link: https://lkml.kernel.org/r/20200915094143.79181-4-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds --- include/linux/node.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/node.h b/include/linux/node.h index 014ba3ab2efd..8e5a29897936 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -99,15 +99,14 @@ extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) -int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn, - enum meminit_context context); +void link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context); #else -static inline int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn, - enum meminit_context context) +static inline void link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context) { - return 0; } #endif @@ -130,8 +129,7 @@ static inline int register_one_node(int nid) if (error) return error; /* link memory sections under this node */ - error = link_mem_sections(nid, start_pfn, end_pfn, - MEMINIT_EARLY); + link_mem_sections(nid, start_pfn, end_pfn, MEMINIT_EARLY); } return error; -- cgit v1.2.3 From ed0173733dd468883198c3136284394320b8fad6 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 15 Oct 2020 20:09:55 -0700 Subject: mm: use self-explanatory macros rather than "2" Signed-off-by: Yu Zhao Signed-off-by: Andrew Morton Cc: Alex Shi Link: http://lkml.kernel.org/r/20200831175042.3527153-2-yuzhao@google.com Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ++++++++---- include/linux/vmstat.h | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 
c27fb1faffe5..7e0ea3fe95ca 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -266,6 +266,8 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } +#define ANON_AND_FILE 2 + enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI @@ -283,8 +285,8 @@ struct lruvec { unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; - /* Refaults at the time of last reclaim cycle, anon=0, file=1 */ - unsigned long refaults[2]; + /* Refaults at the time of last reclaim cycle */ + unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_MEMCG @@ -441,6 +443,8 @@ enum zone_type { #ifndef __GENERATING_BOUNDS_H +#define ASYNC_AND_SYNC 2 + struct zone { /* Read-mostly fields */ @@ -560,8 +564,8 @@ struct zone { #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; - /* pfn where async and sync compaction migration scanner should start */ - unsigned long compact_cached_migrate_pfn[2]; + /* pfn where compaction migration scanner should start */ + unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 7557c1070fd7..322dcbfcc933 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -28,7 +28,7 @@ struct reclaim_stat { unsigned nr_writeback; unsigned nr_immediate; unsigned nr_pageout; - unsigned nr_activate[2]; + unsigned nr_activate[ANON_AND_FILE]; unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; -- cgit v1.2.3 From 1f0f8c0de09066d23760c1f5fac2cd53b32f1127 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 15 Oct 2020 20:10:11 -0700 Subject: include/linux/mmzone.h: remove 
unused early_pfn_valid() The early_pfn_valid() macro is defined but it is never used. Remove it. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Link: https://lkml.kernel.org/r/20200923162915.26935-1-rppt@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7e0ea3fe95ca..fb3bf696c05e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1420,7 +1420,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #define pfn_to_nid(pfn) (0) #endif -#define early_pfn_valid(pfn) pfn_valid(pfn) void sparse_init(void); #else #define sparse_init() do {} while (0) @@ -1440,10 +1439,6 @@ struct mminit_pfnnid_cache { int last_nid; }; -#ifndef early_pfn_valid -#define early_pfn_valid(pfn) (1) -#endif - /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we * need to check pfn validity within that MAX_ORDER_NR_PAGES block. -- cgit v1.2.3 From b296a6d53339a79082c1d2c1761e948e8b3def69 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Oct 2020 20:10:21 -0700 Subject: kernel.h: split out min()/max() et al. helpers kernel.h has been used as a dump for all kinds of stuff for a long time. Here is an attempt to start cleaning it up by splitting out the min()/max() et al. helpers. At the same time, convert users in the header and lib folders to use the new header. For the time being, though, include the new header back in kernel.h to avoid twisted indirect includes for other existing users. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Cc: "Rafael J.
Wysocki" Cc: Steven Rostedt Cc: Rasmus Villemoes Cc: Joe Perches Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20200910164152.GA1891694@smile.fi.intel.com Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 1 + include/linux/bvec.h | 6 +- include/linux/jiffies.h | 3 +- include/linux/kernel.h | 150 +--------------------------------------------- include/linux/minmax.h | 153 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nodemask.h | 2 +- include/linux/uaccess.h | 1 + 7 files changed, 164 insertions(+), 152 deletions(-) create mode 100644 include/linux/minmax.h (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c09375e0a0eb..639cae2c158b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/bvec.h b/include/linux/bvec.h index dd74503f7e5e..2efec10bf792 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -7,10 +7,14 @@ #ifndef __LINUX_BVEC_ITER_H #define __LINUX_BVEC_ITER_H -#include #include #include +#include +#include #include +#include + +struct page; /** * struct bio_vec - a contiguous range of physical memory addresses diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fed6ba96c527..5e13f801c902 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -3,8 +3,9 @@ #define _LINUX_JIFFIES_H #include +#include #include -#include +#include #include #include #include diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e4aa29b1ad62..c629215fdad9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -833,155 +834,6 @@ ftrace_vprintk(const char *fmt, va_list ap) static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #endif /* CONFIG_TRACING */ -/* - * min()/max()/clamp() macros must accomplish three things: - * - * 
- avoid multiple evaluations of the arguments (so side-effects like - * "x++" happen only once) when non-constant. - * - perform strict type-checking (to generate warnings instead of - * nasty runtime surprises). See the "unnecessary" pointer comparison - * in __typecheck(). - * - retain result as a constant expressions when called with only - * constant expressions (to avoid tripping VLA warnings in stack - * allocation usage). - */ -#define __typecheck(x, y) \ - (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) - -/* - * This returns a constant expression while determining if an argument is - * a constant expression, most importantly without evaluating the argument. - * Glory to Martin Uecker - */ -#define __is_constexpr(x) \ - (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) - -#define __no_side_effects(x, y) \ - (__is_constexpr(x) && __is_constexpr(y)) - -#define __safe_cmp(x, y) \ - (__typecheck(x, y) && __no_side_effects(x, y)) - -#define __cmp(x, y, op) ((x) op (y) ? 
(x) : (y)) - -#define __cmp_once(x, y, unique_x, unique_y, op) ({ \ - typeof(x) unique_x = (x); \ - typeof(y) unique_y = (y); \ - __cmp(unique_x, unique_y, op); }) - -#define __careful_cmp(x, y, op) \ - __builtin_choose_expr(__safe_cmp(x, y), \ - __cmp(x, y, op), \ - __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) - -/** - * min - return minimum of two values of the same or compatible types - * @x: first value - * @y: second value - */ -#define min(x, y) __careful_cmp(x, y, <) - -/** - * max - return maximum of two values of the same or compatible types - * @x: first value - * @y: second value - */ -#define max(x, y) __careful_cmp(x, y, >) - -/** - * min3 - return minimum of three values - * @x: first value - * @y: second value - * @z: third value - */ -#define min3(x, y, z) min((typeof(x))min(x, y), z) - -/** - * max3 - return maximum of three values - * @x: first value - * @y: second value - * @z: third value - */ -#define max3(x, y, z) max((typeof(x))max(x, y), z) - -/** - * min_not_zero - return the minimum that is _not_ zero, unless both are zero - * @x: value1 - * @y: value2 - */ -#define min_not_zero(x, y) ({ \ - typeof(x) __x = (x); \ - typeof(y) __y = (y); \ - __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) - -/** - * clamp - return a value clamped to a given range with strict typechecking - * @val: current value - * @lo: lowest allowable value - * @hi: highest allowable value - * - * This macro does strict typechecking of @lo/@hi to make sure they are of the - * same type as @val. See the unnecessary pointer comparisons. - */ -#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) - -/* - * ..and if you can't take the strict - * types, you can specify one yourself. - * - * Or not use min/max/clamp at all, of course. 
- */ - -/** - * min_t - return minimum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value - */ -#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) - -/** - * max_t - return maximum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value - */ -#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) - -/** - * clamp_t - return a value clamped to a given range using a given type - * @type: the type of variable to use - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of type - * @type to make all the comparisons. - */ -#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) - -/** - * clamp_val - return a value clamped to a given range using val's type - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of whatever - * type the input argument @val is. This is useful when @val is an unsigned - * type and @lo and @hi are literals that will otherwise be assigned a signed - * integer type. - */ -#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) - - -/** - * swap - swap values of @a and @b - * @a: first value - * @b: second value - */ -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - /* This counts to 12. Any more, it will return 13th argument. */ #define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n #define COUNT_ARGS(X...) 
__COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) diff --git a/include/linux/minmax.h b/include/linux/minmax.h new file mode 100644 index 000000000000..c0f57b0c64d9 --- /dev/null +++ b/include/linux/minmax.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MINMAX_H +#define _LINUX_MINMAX_H + +/* + * min()/max()/clamp() macros must accomplish three things: + * + * - avoid multiple evaluations of the arguments (so side-effects like + * "x++" happen only once) when non-constant. + * - perform strict type-checking (to generate warnings instead of + * nasty runtime surprises). See the "unnecessary" pointer comparison + * in __typecheck(). + * - retain result as a constant expressions when called with only + * constant expressions (to avoid tripping VLA warnings in stack + * allocation usage). + */ +#define __typecheck(x, y) \ + (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) + +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) + +#define __no_side_effects(x, y) \ + (__is_constexpr(x) && __is_constexpr(y)) + +#define __safe_cmp(x, y) \ + (__typecheck(x, y) && __no_side_effects(x, y)) + +#define __cmp(x, y, op) ((x) op (y) ? 
(x) : (y)) + +#define __cmp_once(x, y, unique_x, unique_y, op) ({ \ + typeof(x) unique_x = (x); \ + typeof(y) unique_y = (y); \ + __cmp(unique_x, unique_y, op); }) + +#define __careful_cmp(x, y, op) \ + __builtin_choose_expr(__safe_cmp(x, y), \ + __cmp(x, y, op), \ + __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) + +/** + * min - return minimum of two values of the same or compatible types + * @x: first value + * @y: second value + */ +#define min(x, y) __careful_cmp(x, y, <) + +/** + * max - return maximum of two values of the same or compatible types + * @x: first value + * @y: second value + */ +#define max(x, y) __careful_cmp(x, y, >) + +/** + * min3 - return minimum of three values + * @x: first value + * @y: second value + * @z: third value + */ +#define min3(x, y, z) min((typeof(x))min(x, y), z) + +/** + * max3 - return maximum of three values + * @x: first value + * @y: second value + * @z: third value + */ +#define max3(x, y, z) max((typeof(x))max(x, y), z) + +/** + * min_not_zero - return the minimum that is _not_ zero, unless both are zero + * @x: value1 + * @y: value2 + */ +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) + +/** + * clamp - return a value clamped to a given range with strict typechecking + * @val: current value + * @lo: lowest allowable value + * @hi: highest allowable value + * + * This macro does strict typechecking of @lo/@hi to make sure they are of the + * same type as @val. See the unnecessary pointer comparisons. + */ +#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) + +/* + * ..and if you can't take the strict + * types, you can specify one yourself. + * + * Or not use min/max/clamp at all, of course. 
+ */ + +/** + * min_t - return minimum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) + +/** + * max_t - return maximum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) + +/** + * clamp_t - return a value clamped to a given range using a given type + * @type: the type of variable to use + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of type + * @type to make all the comparisons. + */ +#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) + +/** + * clamp_val - return a value clamped to a given range using val's type + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of whatever + * type the input argument @val is. This is useful when @val is an unsigned + * type and @lo and @hi are literals that will otherwise be assigned a signed + * integer type. + */ +#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) + +/** + * swap - swap values of @a and @b + * @a: first value + * @b: second value + */ +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#endif /* _LINUX_MINMAX_H */ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 3334ce056335..ac398e143c9a 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -90,9 +90,9 @@ * for such situations. See below and CPUMASK_ALLOC also. 
*/ -#include <linux/kernel.h> #include <linux/threads.h> #include <linux/bitmap.h> +#include <linux/minmax.h> #include <linux/numa.h> typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1ae36bc8db35..ef084eacaa7c 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -3,6 +3,7 @@ #define __LINUX_UACCESS_H__ #include <linux/instrumented.h> +#include <linux/minmax.h> #include <linux/sched.h> #include <linux/thread_info.h> -- cgit v1.2.3 From 3b6742618ed9216dd6caad968fe8c83b32dff485 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 15 Oct 2020 20:11:17 -0700 Subject: lib/idr.c: document calling context for IDA APIs mustn't use locks The documentation for these functions indicates that callers don't need to hold a lock while calling them, but that documentation is only in one place under "IDA Usage". Let's state the same information on each IDA function so that it's clear what the calling context requires. Furthermore, let's document ida_simple_get() with the same information so that callers know how this API works. Signed-off-by: Stephen Boyd Signed-off-by: Andrew Morton Reviewed-by: Greg Kroah-Hartman Cc: Tri Vo Cc: Jonathan Corbet Cc: Matthew Wilcox Link: https://lkml.kernel.org/r/20200910055246.2297797-1-swboyd@chromium.org Signed-off-by: Linus Torvalds --- include/linux/idr.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index 3ade03e5c7af..b235ed987021 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -263,7 +263,8 @@ void ida_destroy(struct ida *ida); * * Allocate an ID between 0 and %INT_MAX, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -280,7 +281,8 @@ static inline int ida_alloc(struct ida *ida, gfp_t gfp) * * Allocate an ID between @min and %INT_MAX, inclusive. * - * Context: Any context.
+ * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -297,7 +299,8 @@ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) * * Allocate an ID between 0 and @max, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ -- cgit v1.2.3 From 3264ceec8f17a99a3895de7de06b4d7e9c8f3f30 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 15 Oct 2020 20:11:21 -0700 Subject: lib/idr.c: document that ida_simple_{get,remove}() are deprecated These two functions are deprecated. Users should call ida_alloc() or ida_free() respectively instead. Add documentation to this effect until the macro can be removed. Signed-off-by: Stephen Boyd Signed-off-by: Andrew Morton Reviewed-by: Tri Vo Cc: Greg KH Cc: Jonathan Corbet Cc: Matthew Wilcox Link: https://lkml.kernel.org/r/20200910055246.2297797-2-swboyd@chromium.org Signed-off-by: Linus Torvalds --- include/linux/idr.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index b235ed987021..a0dce14090a9 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -314,6 +314,10 @@ static inline void ida_init(struct ida *ida) xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } +/* + * ida_simple_get() and ida_simple_remove() are deprecated. Use + * ida_alloc() and ida_free() instead respectively. 
+ */ #define ida_simple_get(ida, start, end, gfp) \ ida_alloc_range(ida, start, (end) - 1, gfp) #define ida_simple_remove(ida, id) ida_free(ida, id) -- cgit v1.2.3 From e130816164e244b692921de49771eeb28205152d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Oct 2020 20:11:31 -0700 Subject: include/linux/list.h: add a macro to test if entry is pointing to the head Add a macro to test if entry is pointing to the head of the list which is useful in cases like: list_for_each_entry(pos, &head, member) { if (cond) break; } if (list_entry_is_head(pos, &head, member)) return -ERRNO; which avoids adding an extra variable to track whether the loop was stopped in the middle. While here, convert the list_for_each_entry*() family of macros to use the new one. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Reviewed-by: Cezary Rojewski Link: https://lkml.kernel.org/r/20200929134342.51489-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Torvalds --- include/linux/list.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 0d0d17a10d25..a18c87b63376 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -609,6 +609,15 @@ static inline void list_splice_tail_init(struct list_head *list, pos != (head); \ pos = n, n = pos->prev) +/** + * list_entry_is_head - test if the entry points to the head of the list + * @pos: the type * to use as a cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_entry_is_head(pos, head, member) \ + (&pos->member == (head)) + /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor.
@@ -617,7 +626,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -628,7 +637,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -653,7 +662,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -667,7 +676,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -679,7 +688,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -692,7 +701,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate backwards over list of given type, continuing from current position. 
*/ #define list_for_each_entry_from_reverse(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -705,7 +714,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -721,7 +730,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -736,7 +745,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -752,7 +761,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** -- cgit v1.2.3 From a9eb63705e379f10a3c9d13fc6aee8b50805e862 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 15 Oct 2020 20:11:41 -0700 Subject: bitops: simplify get_count_order_long() These two cases could be unified into one. 
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: Christian Brauner Cc: Andy Shevchenko Link: https://lkml.kernel.org/r/20200807085837.11697-2-richard.weiyang@linux.alibaba.com Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 99f2ac30b1d9..030a98f0c452 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -206,10 +206,7 @@ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; - else if (l & (l - 1UL)) - return (int)fls_long(l); - else - return (int)fls_long(l) - 1; + return (int)fls_long(--l); } /** -- cgit v1.2.3 From 004fba1ae6ddd66ba0faa4f60c603b3ca77b3554 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 15 Oct 2020 20:11:46 -0700 Subject: bitops: use the same mechanism for get_count_order[_long] These two functions share the same logic. Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: Christian Brauner Cc: Andy Shevchenko Link: https://lkml.kernel.org/r/20200807085837.11697-3-richard.weiyang@linux.alibaba.com Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 030a98f0c452..5b74bdf159d6 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -188,12 +188,10 @@ static inline unsigned fls_long(unsigned long l) static inline int get_count_order(unsigned int count) { - int order; + if (count == 0) + return -1; - order = fls(count) - 1; - if (count & (count - 1)) - order++; - return order; + return fls(--count); } /** -- cgit v1.2.3 From afc63a97b764bc5a715762d0d9cc9785c2ef4e75 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:46 -0700 Subject: coredump: refactor page range dumping into common helper Both fs/binfmt_elf.c and fs/binfmt_elf_fdpic.c need to dump 
ranges of pages into the coredump file. Extract that logic into a common helper. Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-4-jannh@google.com Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 7a899e83835d..f0b71a74d0bc 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -16,6 +16,8 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); +int dump_user_range(struct coredump_params *cprm, unsigned long start, + unsigned long len); #ifdef CONFIG_COREDUMP extern void do_coredump(const kernel_siginfo_t *siginfo); #else -- cgit v1.2.3 From 429a22e776a2b9f85a2b9c53d8e647598b553dd1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:50 -0700 Subject: coredump: rework elf/elf_fdpic vma_dump_size() into common helper At the moment, the binfmt_elf and binfmt_elf_fdpic code have slightly different code to figure out which VMAs should be dumped, and if so, whether the dump should contain the entire VMA or just its first page. Eliminate duplicate code by reworking the binfmt_elf version into a generic core dumping helper in coredump.c. As part of that, change the heuristic for detecting executable/library header pages to check whether the inode is executable instead of looking at the file mode. This is less problematic in terms of locking because it lets us avoid get_user() under the mmap_sem. (And arguably it looks nicer and makes more sense in generic code.) 
Adjust a little bit based on the binfmt_elf_fdpic version: ->anon_vma is only meaningful under CONFIG_MMU, otherwise we have to assume that the VMA has been written to. Suggested-by: Linus Torvalds Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-5-jannh@google.com Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index f0b71a74d0bc..bfecb8d79a7f 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -16,6 +16,7 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); +unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); #ifdef CONFIG_COREDUMP -- cgit v1.2.3 From a07279c9a8cd7dbd321640ff7210591599ee00a4 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:54 -0700 Subject: binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot In both binfmt_elf and binfmt_elf_fdpic, use a new helper dump_vma_snapshot() to take a snapshot of the VMA list (including the gate VMA, if we have one) while protected by the mmap_lock, and then use that snapshot instead of walking the VMA list without locking. An alternative approach would be to keep the mmap_lock held across the entire core dumping operation; however, keeping the mmap_lock locked while we may be blocked for an unbounded amount of time (e.g. 
because we're dumping to a FUSE filesystem or so) isn't really optimal; the mmap_lock blocks things like the ->release handler of userfaultfd, and we don't really want critical system daemons to grind to a halt just because someone "gifted" them SCM_RIGHTS to an eternally-locked userfaultfd, or something like that. Since both the normal ELF code and the FDPIC ELF code need this functionality (and if any other binfmt wants to add coredump support in the future, they'd probably need it, too), implement this with a common helper in fs/coredump.c. A downside of this approach is that we now need a bigger amount of kernel memory per userspace VMA in the normal ELF case, and that we need O(n) kernel memory in the FDPIC ELF case at all; but 40 bytes per VMA shouldn't be terribly bad. There currently is a data race between stack expansion and anything that reads ->vm_start or ->vm_end under the mmap_lock held in read mode; to mitigate that for core dumping, take the mmap_lock in write mode when taking a snapshot of the VMA hierarchy. (If we only took the mmap_lock in read mode, we could end up with a corrupted core dump if someone does get_user_pages_remote() concurrently. Not really a major problem, but taking the mmap_lock either way works here, so we might as well avoid the issue.) (This doesn't do anything about the existing data races with stack expansion in other mm code.) Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . 
Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-6-jannh@google.com Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index bfecb8d79a7f..e58e8c207782 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -7,6 +7,12 @@ #include #include +struct core_vma_metadata { + unsigned long start, end; + unsigned long flags; + unsigned long dump_size; +}; + /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. @@ -16,9 +22,11 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); -unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); +int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, + struct core_vma_metadata **vma_meta, + size_t *vma_data_size_ptr); #ifdef CONFIG_COREDUMP extern void do_coredump(const kernel_siginfo_t *siginfo); #else -- cgit v1.2.3 From 4d45e75a9955ade5c2f49bd96fc4173b2cec9a72 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:13:00 -0700 Subject: mm: remove the now-unnecessary mmget_still_valid() hack The preceding patches have ensured that core dumping properly takes the mmap_lock. Thanks to that, we can now remove mmget_still_valid() and all its users. Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . 
Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-8-jannh@google.com Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 15bfb06f2884..981e34cb1409 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,31 +49,6 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } -/* - * This has to be called after a get_task_mm()/mmget_not_zero() - * followed by taking the mmap_lock for writing before modifying the - * vmas or anything the coredump pretends not to change from under it. - * - * It also has to be called when mmgrab() is used in the context of - * the process, but then the mm_count refcount is transferred outside - * the context of the process to run down_write() on that pinned mm. - * - * NOTE: find_extend_vma() called from GUP context is the only place - * that can modify the "mm" (notably the vm_start/end) under mmap_lock - * for reading and outside the context of the process, so it is also - * the only case that holds the mmap_lock for reading that must call - * this function. Generally if the mmap_lock is hold for reading - * there's no need of this check after get_task_mm()/mmget_not_zero(). - * - * This function can be obsoleted and the check can be removed, after - * the coredump code will hold the mmap_lock for writing before - * invoking the ->core_dump methods. - */ -static inline bool mmget_still_valid(struct mm_struct *mm) -{ - return likely(!mm->core_state); -} - /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. 
-- cgit v1.2.3 From 5cf53f3ce3b9ff5321b56f9ed9d90d59307be7d0 Mon Sep 17 00:00:00 2001 From: Elena Petrova Date: Thu, 15 Oct 2020 20:13:35 -0700 Subject: sched.h: drop in_ubsan field when UBSAN is in trap mode in_ubsan field of task_struct is only used in lib/ubsan.c, which in its turn is used only `ifneq ($(CONFIG_UBSAN_TRAP),y)`. Removing unnecessary field from a task_struct will help preserve the ABI between vanilla and CONFIG_UBSAN_TRAP'ed kernels. In particular, this will help enabling bounds sanitizer transparently for Android's GKI. Signed-off-by: Elena Petrova Signed-off-by: Andrew Morton Acked-by: Kees Cook Cc: Jann Horn Link: https://lkml.kernel.org/r/20200910134802.3160311-1-lenaptr@google.com Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9030f3abd969..063cd120b459 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1013,7 +1013,7 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif -#ifdef CONFIG_UBSAN +#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif -- cgit v1.2.3 From 2c739ced5886cd8c8361faa79a9522ec05174ed0 Mon Sep 17 00:00:00 2001 From: Albert van der Linde Date: Thu, 15 Oct 2020 20:13:46 -0700 Subject: lib, include/linux: add usercopy failure capability Patch series "add fault injection to user memory access", v3. The goal of this series is to improve testing of fault-tolerance in usages of user memory access functions, by adding support for fault injection. syzkaller/syzbot are using the existing fault injection modes and will use this particular feature also. The first patch adds failure injection capability for usercopy functions. The second changes usercopy functions to use this new failure capability (copy_from_user, ...). The third patch adds get/put/clear_user failures to x86. 
This patch (of 3): Add a failure injection capability to improve testing of fault-tolerance in usages of user memory access functions. Add CONFIG_FAULT_INJECTION_USERCOPY to enable faults in usercopy functions. The should_fail_usercopy function is to be called by these functions (copy_from_user, get_user, ...) in order to fail or not. Signed-off-by: Albert van der Linde Signed-off-by: Andrew Morton Reviewed-by: Akinobu Mita Reviewed-by: Alexander Potapenko Cc: Borislav Petkov Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Peter Zijlstra (Intel) Cc: "H. Peter Anvin" Cc: Al Viro Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Marco Elver Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200831171733.955393-1-alinde@google.com Link: http://lkml.kernel.org/r/20200831171733.955393-2-alinde@google.com Signed-off-by: Linus Torvalds --- include/linux/fault-inject-usercopy.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 include/linux/fault-inject-usercopy.h (limited to 'include/linux') diff --git a/include/linux/fault-inject-usercopy.h b/include/linux/fault-inject-usercopy.h new file mode 100644 index 000000000000..56c3a693fdd9 --- /dev/null +++ b/include/linux/fault-inject-usercopy.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_FAULT_INJECT_USERCOPY_H__ +#define __LINUX_FAULT_INJECT_USERCOPY_H__ + +/* + * This header provides a wrapper for injecting failures to user space memory + * access functions. 
+ */ + +#include <linux/types.h> + +#ifdef CONFIG_FAULT_INJECTION_USERCOPY + +bool should_fail_usercopy(void); + +#else + +static inline bool should_fail_usercopy(void) { return false; } + +#endif /* CONFIG_FAULT_INJECTION_USERCOPY */ + +#endif /* __LINUX_FAULT_INJECT_USERCOPY_H__ */ -- cgit v1.2.3 From 4d0e9df5e43dba52d38b251e3b909df8fa1110be Mon Sep 17 00:00:00 2001 From: Albert van der Linde Date: Thu, 15 Oct 2020 20:13:50 -0700 Subject: lib, uaccess: add failure injection to usercopy functions To test fault-tolerance of user memory access functions, introduce fault injection to usercopy functions. If a failure is expected, return either -EFAULT or the total number of bytes that were not copied. Signed-off-by: Albert van der Linde Signed-off-by: Andrew Morton Reviewed-by: Akinobu Mita Reviewed-by: Alexander Potapenko Cc: Al Viro Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Marco Elver Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200831171733.955393-3-alinde@google.com Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ef084eacaa7c..1b8c9d6162bc 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,6 +2,7 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ +#include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> #include <linux/sched.h> @@ -84,6 +85,8 @@ static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; instrument_copy_from_user(to, from, n); check_object_size(to, n, false); return raw_copy_from_user(to, from, n); @@ -105,6 +108,8 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline __must_check unsigned long
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + if (should_fail_usercopy()) + return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); @@ -114,6 +119,8 @@ static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); @@ -125,7 +132,7 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); - if (likely(access_ok(from, n))) { + if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user(to, from, n); res = raw_copy_from_user(to, from, n); } @@ -143,6 +150,8 @@ static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); -- cgit v1.2.3