From d50112edde1d0c621520e53747044009f11c656b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Nov 2017 17:32:18 -0800 Subject: slab, slub, slob: add slab_flags_t Add sparse-checked slab_flags_t for struct kmem_cache::flags (SLAB_POISON, etc). SLAB is bloated temporarily by switching to "unsigned long", but only temporarily. Link: http://lkml.kernel.org/r/20171021100225.GA22428@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 4 ++-- include/linux/kmemleak.h | 8 +++---- include/linux/slab.h | 60 +++++++++++++++++++++++++++++------------------- include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/linux/types.h | 1 + include/net/sock.h | 2 +- 7 files changed, 47 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5017269e3f04..e3eb834c9a35 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -46,7 +46,7 @@ void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags); + slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -95,7 +95,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags) {} + slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 590343f6c1b1..5ac416e2d339 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -48,14 +48,14 @@ extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref; extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, - int min_count, unsigned long flags, + int min_count, slab_flags_t flags, gfp_t gfp) { if (!(flags & SLAB_NOLEAKTRACE)) kmemleak_alloc(ptr, size, min_count, gfp); } -static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) { if (!(flags & SLAB_NOLEAKTRACE)) kmemleak_free(ptr); @@ -76,7 +76,7 @@ static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count, { } static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, - int min_count, unsigned long flags, + int min_count, slab_flags_t flags, gfp_t gfp) { } @@ -94,7 +94,7 @@ static inline void kmemleak_free(const void *ptr) static inline void kmemleak_free_part(const void *ptr, size_t size) { } -static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) { } static inline void kmemleak_free_percpu(const void __percpu *ptr) diff --git a/include/linux/slab.h b/include/linux/slab.h index af5aa65c7c18..0c4c579f52ed 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -21,13 +21,20 @@ * Flags to pass to kmem_cache_create(). * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ -#define SLAB_CONSISTENCY_CHECKS 0x00000100UL /* DEBUG: Perform (expensive) checks on alloc/free */ -#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ -#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */ -#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */ -#define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */ -#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ -#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ +/* DEBUG: Perform (expensive) checks on alloc/free */ +#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100UL) +/* DEBUG: Red zone objs in a cache */ +#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400UL) +/* DEBUG: Poison objects */ +#define SLAB_POISON ((slab_flags_t __force)0x00000800UL) +/* Align objs on cache lines */ +#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000UL) +/* Use GFP_DMA memory */ +#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000UL) +/* DEBUG: Store the last owner for bug hunting */ +#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000UL) +/* Panic if kmem_cache_create() fails */ +#define SLAB_PANIC ((slab_flags_t __force)0x00040000UL) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * @@ -65,44 +72,51 @@ * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ -#define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ -#define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ +/* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000UL) +/* Spread some memory over cpuset */ +#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000UL) +/* Trace allocations and frees */ +#define SLAB_TRACE ((slab_flags_t __force)0x00200000UL) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS -# define SLAB_DEBUG_OBJECTS 0x00400000UL +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000UL) #else -# define SLAB_DEBUG_OBJECTS 0x00000000UL +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00000000UL) #endif -#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ +/* Avoid kmemleak tracing */ +#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000UL) /* Don't track use of uninitialized memory */ #ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK 0x01000000UL +# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000UL) #else -# define SLAB_NOTRACK 0x00000000UL +# define SLAB_NOTRACK ((slab_flags_t __force)0x00000000UL) #endif +/* Fault injection mark */ #ifdef CONFIG_FAILSLAB -# define SLAB_FAILSLAB 0x02000000UL /* Fault injection mark */ +# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000UL) #else -# define SLAB_FAILSLAB 0x00000000UL +# define SLAB_FAILSLAB ((slab_flags_t __force)0x00000000UL) #endif +/* Account to memcg */ #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) -# define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */ +# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000UL) #else -# define SLAB_ACCOUNT 0x00000000UL +# define SLAB_ACCOUNT ((slab_flags_t __force)0x00000000UL) #endif #ifdef CONFIG_KASAN -#define SLAB_KASAN 0x08000000UL +#define SLAB_KASAN ((slab_flags_t __force)0x08000000UL) #else -#define SLAB_KASAN 0x00000000UL +#define SLAB_KASAN ((slab_flags_t __force)0x00000000UL) #endif /* The following flags affect the page allocator grouping pages by mobility */ -#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ +/* Objects are reclaimable */ +#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000UL) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. @@ -128,7 +142,7 @@ void __init kmem_cache_init(void); bool slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, - unsigned long, + slab_flags_t, void (*)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8f7d2b1656d2..072e46e9e1d5 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -20,7 +20,7 @@ struct kmem_cache { struct reciprocal_value reciprocal_buffer_size; /* 2) touched by every alloc & free from the backend */ - unsigned int flags; /* constant flags */ + slab_flags_t flags; /* constant flags */ unsigned int num; /* # of objs per slab */ /* 3) cache_grow/shrink */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 39fa09bcde23..0adae162dc8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -82,7 +82,7 @@ struct kmem_cache_order_objects { struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; /* Used for retriving partial slabs etc */ - unsigned long flags; + slab_flags_t flags; unsigned long min_partial; int size; /* The size of an object including meta data */ int object_size; /* The size of an object without meta data */ diff --git a/include/linux/types.h b/include/linux/types.h index 34fce54e4f1b..732b52c2eae4 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,6 +156,7 @@ typedef u32 dma_addr_t; #endif typedef unsigned __bitwise gfp_t; +typedef unsigned long __bitwise slab_flags_t; typedef unsigned __bitwise fmode_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT diff --git a/include/net/sock.h b/include/net/sock.h index a6b9a8d1a6df..c577286dbffb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1105,7 +1105,7 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; - int slab_flags; + slab_flags_t slab_flags; struct percpu_counter *orphan_count; -- cgit v1.2.3 From 4fd0b46e898791009b03b2fdd6510044fa8730a6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Nov 2017 17:32:21 -0800 Subject: slab, slub, slob: convert slab_flags_t to 32-bit struct kmem_cache::flags is "unsigned long" which is unnecessary on 64-bit as no flags are defined in the higher bits. Switch the field to 32-bit and save some space on x86_64 until such flags appear: add/remove: 0/0 grow/shrink: 0/107 up/down: 0/-657 (-657) function old new delta sysfs_slab_add 720 719 -1 ... check_object 699 676 -23 [akpm@linux-foundation.org: fix printk warning] Link: http://lkml.kernel.org/r/20171021100635.GA8287@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 44 ++++++++++++++++++++++---------------------- include/linux/types.h | 2 +- 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 0c4c579f52ed..f37cb93768ab 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -22,19 +22,19 @@ * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ /* DEBUG: Perform (expensive) checks on alloc/free */ -#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100UL) +#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) /* DEBUG: Red zone objs in a cache */ -#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400UL) +#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) /* DEBUG: Poison objects */ -#define SLAB_POISON ((slab_flags_t __force)0x00000800UL) +#define SLAB_POISON ((slab_flags_t __force)0x00000800U) /* Align objs on cache lines */ -#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000UL) +#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) /* Use GFP_DMA memory */ -#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000UL) +#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000U) /* DEBUG: Store the last owner for bug hunting */ -#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000UL) +#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000U) /* Panic if kmem_cache_create() fails */ -#define SLAB_PANIC ((slab_flags_t __force)0x00040000UL) +#define SLAB_PANIC ((slab_flags_t __force)0x00040000U) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * @@ -73,50 +73,50 @@ * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ /* Defer freeing slabs to RCU */ -#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000UL) +#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000U) /* Spread some memory over cpuset */ -#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000UL) +#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000U) /* Trace allocations and frees */ -#define SLAB_TRACE ((slab_flags_t __force)0x00200000UL) +#define SLAB_TRACE ((slab_flags_t __force)0x00200000U) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS -# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000UL) +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000U) #else -# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00000000UL) +# define SLAB_DEBUG_OBJECTS 0 #endif /* Avoid kmemleak tracing */ -#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000UL) +#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U) /* Don't track use of uninitialized memory */ #ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000UL) +# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000U) #else -# define SLAB_NOTRACK ((slab_flags_t __force)0x00000000UL) +# define SLAB_NOTRACK 0 #endif /* Fault injection mark */ #ifdef CONFIG_FAILSLAB -# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000UL) +# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U) #else -# define SLAB_FAILSLAB ((slab_flags_t __force)0x00000000UL) +# define SLAB_FAILSLAB 0 #endif /* Account to memcg */ #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) -# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000UL) +# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U) #else -# define SLAB_ACCOUNT ((slab_flags_t __force)0x00000000UL) +# define SLAB_ACCOUNT 0 #endif #ifdef CONFIG_KASAN -#define SLAB_KASAN ((slab_flags_t __force)0x08000000UL) +#define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else -#define SLAB_KASAN ((slab_flags_t __force)0x00000000UL) +#define SLAB_KASAN 0 #endif /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ -#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000UL) +#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. diff --git a/include/linux/types.h b/include/linux/types.h index 732b52c2eae4..c94d59ef96cc 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,7 +156,7 @@ typedef u32 dma_addr_t; #endif typedef unsigned __bitwise gfp_t; -typedef unsigned long __bitwise slab_flags_t; +typedef unsigned __bitwise slab_flags_t; typedef unsigned __bitwise fmode_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT -- cgit v1.2.3 From 5799b255c491d853b73fb9e0e1760210315d06cd Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 15 Nov 2017 17:32:29 -0800 Subject: include/linux/slab.h: add kmalloc_array_node() and kcalloc_node() Patch series "Add kmalloc_array_node() and kcalloc_node()". Our current memeory allocation routines suffer form an API imbalance, for one we have kmalloc_array() and kcalloc() which check for overflows in size multiplication and we have kmalloc_node() and kzalloc_node() which allow for memory allocation on a certain NUMA node but don't check for eventual overflows. This patch (of 6): We have kmalloc_array() and kcalloc() wrappers on top of kmalloc() which ensure us overflow free multiplication for the size of a memory allocation but these implementations are not NUMA-aware. Likewise we have kmalloc_node() which is a NUMA-aware version of kmalloc() but the implementation is not aware of any possible overflows in eventual size calculations. Introduce a combination of the two above cases to have a NUMA-node aware version of kmalloc_array() and kcalloc(). Link: http://lkml.kernel.org/r/20170927082038.3782-2-jthumshirn@suse.de Signed-off-by: Johannes Thumshirn Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Christoph Lameter Cc: Damien Le Moal Cc: David Rientjes Cc: "David S. Miller" Cc: Doug Ledford Cc: Hal Rosenstock Cc: Jens Axboe Cc: Joonsoo Kim Cc: Mike Marciniszyn Cc: Pekka Enberg Cc: Santosh Shilimkar Cc: Sean Hefty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index f37cb93768ab..ebddfca9e24b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -650,6 +650,22 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) +static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, + int node) +{ + if (size != 0 && n > SIZE_MAX / size) + return NULL; + if (__builtin_constant_p(n) && __builtin_constant_p(size)) + return kmalloc_node(n * size, flags, node); + return __kmalloc_node(n * size, flags, node); +} + +static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) +{ + return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); +} + + #ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ -- cgit v1.2.3 From 41710443f790b5f7f5305eba99dacce88e259f4c Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Wed, 15 Nov 2017 17:32:53 -0800 Subject: mm: update comments for struct page.mapping struct page.mapping can be NULL or points to one object of type address_space, anon_vma or KSM private structure. Link: http://lkml.kernel.org/r/1506485067-15954-1-git-send-email-changbin.du@intel.com Signed-off-by: Changbin Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c85f11dafd56..d1b8e8f97fc2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -48,8 +48,10 @@ struct page { * inode address_space, or NULL. * If page mapped as anonymous * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. + * it points to anon_vma object + * or KSM private structure. See + * PAGE_MAPPING_ANON and + * PAGE_MAPPING_KSM. */ void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ -- cgit v1.2.3 From 23c47d2ada9f96731492a67b28c0072715075baa Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:00 -0800 Subject: bdi: introduce BDI_CAP_SYNCHRONOUS_IO As discussed at https://lkml.kernel.org/r/<20170728165604.10455-1-ross.zwisler@linux.intel.com> someday we will remove rw_page(). If so, we need something to detect such super-fast storage on which synchronous IO operations like the current rw_page are always a win. Introduces BDI_CAP_SYNCHRONOUS_IO to indicate such devices. With it, we could use various optimization techniques. Link: http://lkml.kernel.org/r/1505886205-9671-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Christoph Hellwig Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f41ca8486e02..7360e79e9a2f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -122,6 +122,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. * * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. + * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be + * inefficient. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -129,6 +131,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_CGROUP_WRITEBACK 0x00000020 +#define BDI_CAP_SYNCHRONOUS_IO 0x00000040 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) @@ -174,6 +177,11 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); +static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO; +} + static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { return bdi->capabilities & BDI_CAP_STABLE_WRITES; -- cgit v1.2.3 From 539a6fea7fdcade532bd3e77be2862a683f8f0c9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:04 -0800 Subject: mm, swap: introduce SWP_SYNCHRONOUS_IO If rw-page based fast storage is used for swap devices, we need to detect it to enhance swap IO operations. This patch is preparation for optimizing of swap-in operation with next patch. Link: http://lkml.kernel.org/r/1505886205-9671-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Dan Williams Cc: Ilya Dryomov Cc: Jens Axboe Cc: Ross Zwisler Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index f02fb5db8914..933d7c0c3542 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -171,8 +171,9 @@ enum { SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL -- cgit v1.2.3 From 0bcac06f27d7528591c27ac2b093ccd71c5d0168 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:07 -0800 Subject: mm, swap: skip swapcache for swapin of synchronous device With fast swap storage, the platforms want to use swap more aggressively and swap-in is crucial to application latency. The rw_page() based synchronous devices like zram, pmem and btt are such fast storage. When I profile swapin performance with zram lz4 decompress test, S/W overhead is more than 70%. Maybe, it would be bigger in nvdimm. This patch aims to reduce swap-in latency by skipping swapcache if the swap device is synchronous device like rw_page based device. It enhances 45% my swapin test(5G sequential swapin, no readahead, from 2.41sec to 1.64sec). Link: http://lkml.kernel.org/r/1505886205-9671-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 933d7c0c3542..32c06f028c7b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -466,6 +466,7 @@ extern int page_swapcount(struct page *); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); +extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -474,6 +475,16 @@ extern void exit_swap_address_space(unsigned int type); #else /* CONFIG_SWAP */ +static inline int swap_readpage(struct page *page, bool do_poll) +{ + return 0; +} + +static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) +{ + return NULL; +} + #define swap_address_space(entry) (NULL) #define get_nr_swap_pages() 0L #define total_swap_pages 0L -- cgit v1.2.3 From aa8d22a11da933dbf880b4933b58931f4aefe91c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:11 -0800 Subject: mm: swap: SWP_SYNCHRONOUS_IO: skip swapcache only if swapped page has no other reference When SWP_SYNCHRONOUS_IO swapped-in pages are shared by several processes, it can cause unnecessary memory wastage by skipping swap cache. Because, with swapin fault by read, they could share a page if the page were in swap cache. Thus, it avoids allocating same content new pages. This patch makes the swapcache skipping work only if the swap pte is non-sharable. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1507620825-5537-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 32c06f028c7b..8b8a6f965785 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -463,6 +463,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -589,6 +590,11 @@ static inline int page_swapcount(struct page *page) return 0; } +static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +{ + return 0; +} + static inline int __swp_swapcount(swp_entry_t entry) { return 0; -- cgit v1.2.3 From 4da2ce250f986060750fcc5b29112914e31803ba Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:33:26 -0800 Subject: mm: distinguish CMA and MOVABLE isolation in has_unmovable_pages() Joonsoo has noticed that "mm: drop migrate type checks from has_unmovable_pages" would break CMA allocator because it relies on has_unmovable_pages returning false even for CMA pageblocks which in fact don't have to be movable: alloc_contig_range start_isolate_page_range set_migratetype_isolate has_unmovable_pages This is a result of the code sharing between CMA and memory hotplug while each one has a different idea of what has_unmovable_pages should return. This is unfortunate but fixing it properly would require a lot of code duplication. Fix the issue by introducing the requested migrate type argument and special case MIGRATE_CMA case where CMA page blocks are handled properly. This will work for memory hotplug because it requires MIGRATE_MOVABLE. Link: http://lkml.kernel.org/r/20171019122118.y6cndierwl2vnguj@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Joonsoo Kim Tested-by: Stefan Wahren Tested-by: Ran Wang Cc: Michael Ellerman Cc: Vlastimil Babka Cc: Igor Mammedov Cc: KAMEZAWA Hiroyuki Cc: Reza Arbab Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 05a04e603686..cdad58bbfd8b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -31,7 +31,7 @@ static inline bool is_migrate_isolate(int migratetype) #endif bool has_unmovable_pages(struct zone *zone, struct page *page, int count, - bool skip_hwpoisoned_pages); + int migratetype, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); -- cgit v1.2.3 From 66e8b438bd5c75498cfe915c4219049eaebcb869 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Wed, 15 Nov 2017 17:33:42 -0800 Subject: mm/memblock.c: make the index explicit argument of for_each_memblock_type for_each_memblock_type macro function relies on idx variable defined in the caller context. Silent macro arguments are almost always wrong thing to do. They make code harder to read and easier to get wrong. Let's use an explicit iterator parameter for for_each_memblock_type and make the code more obious. This patch is a mere cleanup and it shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170913133029.28911-1-gi-oh.kim@profitbricks.com Signed-off-by: Gioh Kim Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index bae11c7e7bf3..ce0e5634c2f9 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -389,10 +389,10 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ region++) -#define for_each_memblock_type(memblock_type, rgn) \ - for (idx = 0, rgn = &memblock_type->regions[0]; \ - idx < memblock_type->cnt; \ - idx++, rgn = &memblock_type->regions[idx]) +#define for_each_memblock_type(i, memblock_type, rgn) \ + for (i = 0, rgn = &memblock_type->regions[0]; \ + i < memblock_type->cnt; \ + i++, rgn = &memblock_type->regions[i]) #ifdef CONFIG_MEMTEST extern void early_memtest(phys_addr_t start, phys_addr_t end); -- cgit v1.2.3 From 0bea803e9e6bf3836d5368a6426c30a8c0e5eab5 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 15 Nov 2017 17:34:00 -0800 Subject: mm/hmm: constify hmm_devmem_page_get_drvdata() parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Constify pointer parameter to avoid issue when use from code that only has const struct page pointer to use in the first place. Link: http://lkml.kernel.org/r/1506972774-10191-1-git-send-email-jglisse@redhat.com Signed-off-by: Ralph Campbell Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 96e69979f84d..325017ad9311 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -471,9 +471,9 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page, * @page: pointer to struct page * Return: driver data value */ -static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) +static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { - unsigned long *drvdata = (unsigned long *)&page->pgmap; + const unsigned long *drvdata = (const unsigned long *)&page->pgmap; return drvdata[1]; } -- cgit v1.2.3 From 0f10851ea475e08896ee5d9a2036d1bb46a8f3a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Wed, 15 Nov 2017 17:34:07 -0800 Subject: mm/mmu_notifier: avoid double notification when it is useless MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch only affects users of mmu_notifier->invalidate_range callback which are device drivers related to ATS/PASID, CAPI, IOMMUv2, SVM ... and it is an optimization for those users. Everyone else is unaffected by it. When clearing a pte/pmd we are given a choice to notify the event under the page table lock (notify version of *_clear_flush helpers do call the mmu_notifier_invalidate_range). But that notification is not necessary in all cases. This patch removes almost all cases where it is useless to have a call to mmu_notifier_invalidate_range before mmu_notifier_invalidate_range_end. It also adds documentation in all those cases explaining why. Below is a more in depth analysis of why this is fine to do this: For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a process virtual address space). There is only 2 cases when you need to notify those secondary TLB while holding page table lock when clearing a pte/pmd: A) page backing address is free before mmu_notifier_invalidate_range_end B) a page table entry is updated to point to a new page (COW, write fault on zero page, __replace_page(), ...) Case A is obvious you do not want to take the risk for the device to write to a page that might now be used by something completely different. Case B is more subtle. For correctness it requires the following sequence to happen: - take page table lock - clear page table entry and notify (pmd/pte_huge_clear_flush_notify()) - set page table entry to point to new page If clearing the page table entry is not followed by a notify before setting the new pte/pmd value then you can break memory model like C11 or C++11 for the device. Consider the following scenario (device use a feature similar to ATS/ PASID): Two address addrA and addrB such that |addrA - addrB| >= PAGE_SIZE we assume they are write protected for COW (other case of B apply too). [Time N] ----------------------------------------------------------------- CPU-thread-0 {try to write to addrA} CPU-thread-1 {try to write to addrB} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {read addrA and populate device TLB} DEV-thread-2 {read addrB and populate device TLB} [Time N+1] --------------------------------------------------------------- CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+2] --------------------------------------------------------------- CPU-thread-0 {COW_step1: {update page table point to new page for addrA}} CPU-thread-1 {COW_step1: {update page table point to new page for addrB}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+3] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {preempted} CPU-thread-2 {write to addrA which is a write to new page} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+3] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {preempted} CPU-thread-2 {} CPU-thread-3 {write to addrB which is a write to new page} DEV-thread-0 {} DEV-thread-2 {} [Time N+4] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+5] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {read addrA from old page} DEV-thread-2 {read addrB from new page} So here because at time N+2 the clear page table entry was not pair with a notification to invalidate the secondary TLB, the device see the new value for addrB before seing the new value for addrA. This break total memory ordering for the device. When changing a pte to write protect or to point to a new write protected page with same content (KSM) it is ok to delay invalidate_range callback to mmu_notifier_invalidate_range_end() outside the page table lock. This is true even if the thread doing page table update is preempted right after releasing page table lock before calling mmu_notifier_invalidate_range_end Thanks to Andrea for thinking of a problematic scenario for COW. [jglisse@redhat.com: v2] Link: http://lkml.kernel.org/r/20171017031003.7481-2-jglisse@redhat.com Link: http://lkml.kernel.org/r/20170901173011.10745-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Joerg Roedel Cc: Suravee Suthikulpanit Cc: David Woodhouse Cc: Alistair Popple Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell Cc: Andrew Donnellan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2cf1c3c807f6..130831718e95 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -156,7 +156,8 @@ struct mmu_notifier_ops { * shared page-tables, it not necessary to implement the * invalidate_range_start()/end() notifiers, as * invalidate_range() alread catches the points in time when an - * external TLB range needs to be flushed. + * external TLB range needs to be flushed. For more in depth + * discussion on this see Documentation/vm/mmu_notifier.txt * * The invalidate_range() function is called under the ptl * spin-lock and not allowed to sleep. -- cgit v1.2.3 From 4645b9fe84bf4878f04c7959a75df7c3c2d1bbb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Wed, 15 Nov 2017 17:34:11 -0800 Subject: mm/mmu_notifier: avoid call to invalidate_range() in range_end() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is an optimization patch that only affect mmu_notifier users which rely on the invalidate_range() callback. This patch avoids calling that callback twice in a row from inside __mmu_notifier_invalidate_range_end Existing pattern (before this patch): mmu_notifier_invalidate_range_start() pte/pmd/pud_clear_flush_notify() mmu_notifier_invalidate_range() mmu_notifier_invalidate_range_end() mmu_notifier_invalidate_range() New pattern (after this patch): mmu_notifier_invalidate_range_start() pte/pmd/pud_clear_flush_notify() mmu_notifier_invalidate_range() mmu_notifier_invalidate_range_only_end() We call the invalidate_range callback after clearing the page table under the page table lock and we skip the call to invalidate_range inside the __mmu_notifier_invalidate_range_end() function. Idea from Andrea Arcangeli Link: http://lkml.kernel.org/r/20171017031003.7481-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Joerg Roedel Cc: Suravee Suthikulpanit Cc: David Woodhouse Cc: Alistair Popple Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell Cc: Andrew Donnellan Cc: Nadav Amit Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 130831718e95..b25dc9db19fc 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -214,7 +214,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm, extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); @@ -268,7 +269,14 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_end(mm, start, end); + __mmu_notifier_invalidate_range_end(mm, start, end, false); +} + +static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + __mmu_notifier_invalidate_range_end(mm, start, end, true); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, @@ -439,6 +447,11 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, { } +static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { -- cgit v1.2.3 From 3a50d14d0df5776e002a8683a290c87eeac93a21 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 15 Nov 2017 17:34:15 -0800 Subject: mm: remove unused pgdat->inactive_ratio Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list") 'pgdat->inactive_ratio' is not used, except for printing "node_inactive_ratio: 0" in /proc/zoneinfo output. Remove it. Link: http://lkml.kernel.org/r/20171003152611.27483-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a507f43ad221..39ccaa9c428b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -712,12 +712,6 @@ typedef struct pglist_data { /* Fields commonly accessed by the page reclaim scanner */ struct lruvec lruvec; - /* - * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on - * this node's LRU. Maintained by the pageout code. - */ - unsigned int inactive_ratio; - unsigned long flags; ZONE_PADDING(_pad2_) -- cgit v1.2.3 From 8745808fda84c638e45cc860c8fb600bf4b0a2a6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:34:22 -0800 Subject: mm, arch: remove empty_bad_page* empty_bad_page() and empty_bad_pte_table() seem to be relics from old days which is not used by any code for a long time. I have tried to find when exactly but this is not really all that straightforward due to many code movements - traces disappear around 2.4 times. Anyway no code really references neither empty_bad_page nor empty_bad_pte_table. We only allocate the storage which is not used by anybody so remove them. Link: http://lkml.kernel.org/r/20171004150045.30755-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Ralf Baechle Acked-by: Ingo Molnar Cc: Yoshinori Sato Cc: David Howells Cc: Rich Felker Cc: Jeff Dike Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 584b14c774c1..3ec44e27aa9d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -18,7 +18,7 @@ * Various page->flags bits: * * PG_reserved is set for special pages, which can never be swapped out. Some - * of them might not even exist (eg empty_bad_page)... + * of them might not even exist... * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by -- cgit v1.2.3 From 72b045aecdd856b083521f2a963705b4c2e59680 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:33 -0800 Subject: mm: implement find_get_pages_range_tag() Patch series "Ranged pagevec tagged lookup", v3. In this series I provide a ranged variant of pagevec_lookup_tag() and use it in places where it makes sense. This series removes some common code and it also has a potential for speeding up some operations similarly as for pagevec_lookup_range() (but for now I can think of only artificial cases where this happens). This patch (of 16): Implement a variant of find_get_pages_tag() that stops iterating at given index. Lots of users of this function (through pagevec_lookup()) actually want a range lookup and all of them are currently open-coding this. Also create corresponding pagevec_lookup_range_tag() function. Link: http://lkml.kernel.org/r/20171009151359.31984-2-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: Bob Peterson Cc: Chao Yu Cc: David Howells Cc: David Sterba Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Ryusuke Konishi Cc: Steve French Cc: "Theodore Ts'o" Cc: "Yan, Zheng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++++-- include/linux/pagevec.h | 11 +++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e08b5339023c..956d6a80e126 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -366,8 +366,16 @@ static inline unsigned find_get_pages(struct address_space *mapping, } unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages); +static inline unsigned find_get_pages_tag(struct address_space *mapping, + pgoff_t *index, int tag, unsigned int nr_pages, + struct page **pages) +{ + return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, + nr_pages, pages); +} unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, int tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 2636c0c0f279..afc718f586f8 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -38,9 +38,16 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1); } -unsigned pagevec_lookup_tag(struct pagevec *pvec, +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned nr_pages); +static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages); + unsigned nr_pages) +{ + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, + nr_pages); +} static inline void pagevec_init(struct pagevec *pvec, int cold) { -- cgit v1.2.3 From 93d3b7140ad379885849ad2674b4290c9e8273da Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:12 -0800 Subject: mm: add variant of pagevec_lookup_range_tag() taking number of pages Currently pagevec_lookup_range_tag() takes number of pages to look up but most users don't need this. Create a new function pagevec_lookup_range_nr_tag() that takes maximum number of pages to lookup for Ceph which wants this functionality so that we can drop nr_pages argument from pagevec_lookup_range_tag(). Link: http://lkml.kernel.org/r/20171009151359.31984-13-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index afc718f586f8..87a2dfd62f5e 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -41,6 +41,9 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned nr_pages); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages) -- cgit v1.2.3 From 67fd707f468142d0f689a6240044bb45c1913003 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:19 -0800 Subject: mm: remove nr_pages argument from pagevec_lookup_{,range}_tag() All users of pagevec_lookup() and pagevec_lookup_range() now pass PAGEVEC_SIZE as a desired number of pages. Just drop the argument. Link: http://lkml.kernel.org/r/20171009151359.31984-15-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 87a2dfd62f5e..4f56e0ad9d00 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -40,16 +40,14 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned nr_pages); + int tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages) + struct address_space *mapping, pgoff_t *index, int tag) { - return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, - nr_pages); + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); } static inline void pagevec_init(struct pagevec *pvec, int cold) -- cgit v1.2.3 From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:33 -0800 Subject: mm: account pud page tables On a machine with 5-level paging support a process can allocate significant amount of memory and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PUD page tables. We don't account PUD page tables, only PMD and PTE. We already addressed the same issue for PMD page tables, see commit dc6c9a35b66b ("mm: account pmd page tables to the process"). Introduction of 5-level paging brings the same issue for PUD page tables. The patch expands accounting to PUD level. [kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/] Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com [heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting] Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Heiko Carstens Acked-by: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 36 +++++++++++++++++++++++++++++++++--- include/linux/mm_types.h | 3 +++ 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 91b46f99b4d2..9af86f39d928 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif -#ifdef __PAGETABLE_PUD_FOLDED +#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_nr_puds_init(struct mm_struct *mm) {} +static inline void mm_inc_nr_puds(struct mm_struct *mm) {} +static inline void mm_dec_nr_puds(struct mm_struct *mm) {} + #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); + +static inline void mm_nr_puds_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_puds, 0); +} + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_puds); +} + +static inline void mm_inc_nr_puds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_puds); +} + +static inline void mm_dec_nr_puds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_puds); +} #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) @@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, static inline void mm_nr_pmds_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return 0; } @@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm) atomic_long_set(&mm->nr_pmds, 0); } -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return atomic_long_read(&mm->nr_pmds); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d1b8e8f97fc2..e9e561e02d22 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -404,6 +404,9 @@ struct mm_struct { atomic_long_t nr_ptes; /* PTE page table pages */ #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ +#endif +#if CONFIG_PGTABLE_LEVELS > 3 + atomic_long_t nr_puds; /* PUD page table pages */ #endif int map_count; /* number of VMAs */ -- cgit v1.2.3 From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:37 -0800 Subject: mm: introduce wrappers to access mm->nr_ptes Let's add wrappers for ->nr_ptes with the same interface as for nr_pmd and nr_pud. The patch also makes nr_ptes accounting dependent onto CONFIG_MMU. Page table accounting doesn't make sense if you don't have page tables. It's preparation for consolidation of page-table counters in mm_struct. Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 2 ++ 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9af86f39d928..2ca799f0d762 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1680,6 +1680,38 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) } #endif +#ifdef CONFIG_MMU +static inline void mm_nr_ptes_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_ptes, 0); +} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_ptes); +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_ptes); +} + +static inline void mm_dec_nr_ptes(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_ptes); +} +#else +static inline void mm_nr_ptes_init(struct mm_struct *mm) {} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} +static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} +#endif + int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e9e561e02d22..e42048020664 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -401,7 +401,9 @@ struct mm_struct { */ atomic_t mm_count; +#ifdef CONFIG_MMU atomic_long_t nr_ptes; /* PTE page table pages */ +#endif #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ #endif -- cgit v1.2.3 From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:40 -0800 Subject: mm: consolidate page table accounting Currently, we account page tables separately for each page table level, but that's redundant -- we only make use of total memory allocated to page tables for oom_badness calculation. We also provide the information to userspace, but it has dubious value there too. This patch switches page table accounting to single counter. mm->pgtables_bytes is now used to account all page table levels. We use bytes, because page table size for different levels of page table tree may be different. The change has user-visible effect: we don't have VmPMD and VmPUD reported in /proc/[pid]/status. Not sure if anybody uses them. (As alternative, we can always report 0 kB for them.) OOM-killer report is also slightly changed: we now report pgtables_bytes instead of nr_ptes, nr_pmd, nr_puds. Apart from reducing number of counters per-mm, the benefit is that we now calculate oom_badness() more correctly for machines which have different size of page tables depending on level or where page tables are less than a page in size. The only downside can be debuggability because we do not know which page table level could leak. But I do not remember many bugs that would be caught by separate counters so I wouldn't lose sleep over this. [akpm@linux-foundation.org: fix mm/huge_memory.c] Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko [kirill.shutemov@linux.intel.com: fix build] Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 58 ++++++++++-------------------------------------- include/linux/mm_types.h | 8 +------ 2 files changed, 13 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ca799f0d762..7c1e82a1aa77 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1605,37 +1605,20 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, { return 0; } - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return 0; -} - -static inline void mm_nr_puds_init(struct mm_struct *mm) {} static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); -static inline void mm_nr_puds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_puds, 0); -} - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_puds); -} - static inline void mm_inc_nr_puds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_puds); + atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_puds); + atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif @@ -1646,64 +1629,47 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, return 0; } -static inline void mm_nr_pmds_init(struct mm_struct *mm) {} - -static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) -{ - return 0; -} - static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); -static inline void mm_nr_pmds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_pmds, 0); -} - -static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_pmds); -} - static inline void mm_inc_nr_pmds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_pmds); + atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_pmds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_pmds); + atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU -static inline void mm_nr_ptes_init(struct mm_struct *mm) +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { - atomic_long_set(&mm->nr_ptes, 0); + atomic_long_set(&mm->pgtables_bytes, 0); } -static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { - return atomic_long_read(&mm->nr_ptes); + return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_ptes); + atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_ptes); + atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else -static inline void mm_nr_ptes_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e42048020664..09643e0472fc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -402,13 +402,7 @@ struct mm_struct { atomic_t mm_count; #ifdef CONFIG_MMU - atomic_long_t nr_ptes; /* PTE page table pages */ -#endif -#if CONFIG_PGTABLE_LEVELS > 2 - atomic_long_t nr_pmds; /* PMD page table pages */ -#endif -#if CONFIG_PGTABLE_LEVELS > 3 - atomic_long_t nr_puds; /* PUD page table pages */ + atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif int map_count; /* number of VMAs */ -- cgit v1.2.3 From 4950276672fce5c241857540f8561c440663673d Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:51 -0800 Subject: kmemcheck: remove annotations Patch series "kmemcheck: kill kmemcheck", v2. As discussed at LSF/MM, kill kmemcheck. KASan is a replacement that is able to work without the limitation of kmemcheck (single CPU, slow). KASan is already upstream. We are also not aware of any users of kmemcheck (or users who don't consider KASan as a suitable replacement). The only objection was that since KASAN wasn't supported by all GCC versions provided by distros at that time we should hold off for 2 years, and try again. Now that 2 years have passed, and all distros provide gcc that supports KASAN, kill kmemcheck again for the very same reasons. This patch (of 4): Remove kmemcheck annotations, and calls to kmemcheck from the kernel. [alexander.levin@verizon.com: correctly remove kmemcheck call from dma_map_sg_attrs] Link: http://lkml.kernel.org/r/20171012192151.26531-1-alexander.levin@verizon.com Link: http://lkml.kernel.org/r/20171007030159.22241-2-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/c2port.h | 4 ---- include/linux/dma-mapping.h | 8 +------- include/linux/filter.h | 2 -- include/linux/mm_types.h | 8 -------- include/linux/net.h | 3 --- include/linux/ring_buffer.h | 3 --- include/linux/skbuff.h | 3 --- include/net/inet_sock.h | 3 --- include/net/inet_timewait_sock.h | 4 ---- include/net/sock.h | 3 --- 10 files changed, 1 insertion(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/c2port.h b/include/linux/c2port.h index 4efabcb51347..f2736348ca26 100644 --- a/include/linux/c2port.h +++ b/include/linux/c2port.h @@ -9,8 +9,6 @@ * the Free Software Foundation */ -#include - #define C2PORT_NAME_LEN 32 struct device; @@ -22,10 +20,8 @@ struct device; /* Main struct */ struct c2port_ops; struct c2port_device { - kmemcheck_bitfield_begin(flags); unsigned int access:1; unsigned int flash_access:1; - kmemcheck_bitfield_end(flags); int id; char name[C2PORT_NAME_LEN]; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index eee1499db396..e8f8e8fb244d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -232,7 +231,6 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; - kmemcheck_mark_initialized(ptr, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, virt_to_page(ptr), offset_in_page(ptr), size, @@ -265,11 +263,8 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); - int i, ents; - struct scatterlist *s; + int ents; - for_each_sg(sg, s, nents, i) - kmemcheck_mark_initialized(sg_virt(s), s->length); BUG_ON(!valid_dma_direction(dir)); ents = ops->map_sg(dev, sg, nents, dir, attrs); BUG_ON(ents < 0); @@ -299,7 +294,6 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev, const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; - kmemcheck_mark_initialized(page_address(page) + offset, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, page, offset, size, dir, attrs); debug_dma_map_page(dev, page, offset, size, dir, addr, false); diff --git a/include/linux/filter.h b/include/linux/filter.h index 48ec57e70f9f..42197b16dd78 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -454,13 +454,11 @@ struct bpf_binary_header { struct bpf_prog { u16 pages; /* Number of allocated pages */ - kmemcheck_bitfield_begin(meta); u16 jited:1, /* Is our filter JIT'ed? */ locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1; /* Do we need dst entry? */ - kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 09643e0472fc..cfd0ac4e5e0e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -209,14 +209,6 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ -#ifdef CONFIG_KMEMCHECK - /* - * kmemcheck wants to track the status of each byte in a page; this - * is a pointer to such a status block. NULL if not tracked. - */ - void *shadow; -#endif - #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif diff --git a/include/linux/net.h b/include/linux/net.h index d97d80d7fdf8..caeb159abda5 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -22,7 +22,6 @@ #include #include #include /* For O_CLOEXEC and O_NONBLOCK */ -#include #include #include #include @@ -111,9 +110,7 @@ struct socket_wq { struct socket { socket_state state; - kmemcheck_bitfield_begin(type); short type; - kmemcheck_bitfield_end(type); unsigned long flags; diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index fa6ace66fea5..289e4d54e3e0 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -2,7 +2,6 @@ #ifndef _LINUX_RING_BUFFER_H #define _LINUX_RING_BUFFER_H -#include #include #include #include @@ -14,9 +13,7 @@ struct ring_buffer_iter; * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { - kmemcheck_bitfield_begin(bitfield); u32 type_len:5, time_delta:27; - kmemcheck_bitfield_end(bitfield); u32 array[]; }; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d448a4804aea..aa1341474916 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -15,7 +15,6 @@ #define _LINUX_SKBUFF_H #include -#include #include #include #include @@ -704,7 +703,6 @@ struct sk_buff { /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. */ - kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ @@ -723,7 +721,6 @@ struct sk_buff { head_frag:1, xmit_more:1, __unused:1; /* one bit hole */ - kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index db8162dd8c0b..8e51b4a69088 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -17,7 +17,6 @@ #define _INET_SOCK_H #include -#include #include #include #include @@ -84,7 +83,6 @@ struct inet_request_sock { #define ireq_state req.__req_common.skc_state #define ireq_family req.__req_common.skc_family - kmemcheck_bitfield_begin(flags); u16 snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, @@ -93,7 +91,6 @@ struct inet_request_sock { ecn_ok : 1, acked : 1, no_srccheck: 1; - kmemcheck_bitfield_end(flags); u32 ir_mark; union { struct ip_options_rcu __rcu *ireq_opt; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 6a75d67a30fd..1356fa6a7566 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -15,8 +15,6 @@ #ifndef _INET_TIMEWAIT_SOCK_ #define _INET_TIMEWAIT_SOCK_ - -#include #include #include #include @@ -69,14 +67,12 @@ struct inet_timewait_sock { /* Socket demultiplex comparisons on incoming packets. */ /* these three are in inet_sock */ __be16 tw_sport; - kmemcheck_bitfield_begin(flags); /* And these are ours. */ unsigned int tw_kill : 1, tw_transparent : 1, tw_flowlabel : 20, tw_pad : 2, /* 2 bits hole */ tw_tos : 8; - kmemcheck_bitfield_end(flags); struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; diff --git a/include/net/sock.h b/include/net/sock.h index c577286dbffb..a63e6a8bb7e0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -436,7 +436,6 @@ struct sock { #define SK_FL_TYPE_MASK 0xffff0000 #endif - kmemcheck_bitfield_begin(flags); unsigned int sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, @@ -445,8 +444,6 @@ struct sock { sk_protocol : 8, sk_type : 16; #define SK_PROTOCOL_MAX U8_MAX - kmemcheck_bitfield_end(flags); - u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; -- cgit v1.2.3 From 75f296d93bcebcfe375884ddac79e30263a31766 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:54 -0800 Subject: kmemcheck: stop using GFP_NOTRACK and SLAB_NOTRACK Convert all allocations that used a NOTRACK flag to stop using it. Link: http://lkml.kernel.org/r/20171007030159.22241-3-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thread_info.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 4bcdf00c110f..34f053a150a9 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -44,10 +44,9 @@ enum { #endif #if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK) -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ - __GFP_ZERO) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #else -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT) #endif /* -- cgit v1.2.3 From d8be75663cec0069b85f80191abd2682ce4a512f Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:58 -0800 Subject: kmemcheck: remove whats left of NOTRACK flags Now that kmemcheck is gone, we don't need the NOTRACK flags. Link: http://lkml.kernel.org/r/20171007030159.22241-5-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 --------- include/linux/slab.h | 6 ------ include/trace/events/mmflags.h | 1 - 3 files changed, 16 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 710143741eb5..b041f94678de 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -37,7 +37,6 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x40000u #define ___GFP_ATOMIC 0x80000u #define ___GFP_ACCOUNT 0x100000u -#define ___GFP_NOTRACK 0x200000u #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u @@ -201,19 +200,11 @@ struct vm_area_struct; * __GFP_COMP address compound page metadata. * * __GFP_ZERO returns a zeroed page on success. - * - * __GFP_NOTRACK avoids tracking with kmemcheck. - * - * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of - * distinguishing in the source between false positives and allocations that - * cannot be supported (e.g. page tables). */ #define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) -#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) -#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) diff --git a/include/linux/slab.h b/include/linux/slab.h index ebddfca9e24b..79f6532f8a0b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -89,12 +89,6 @@ /* Avoid kmemleak tracing */ #define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U) -/* Don't track use of uninitialized memory */ -#ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000U) -#else -# define SLAB_NOTRACK 0 -#endif /* Fault injection mark */ #ifdef CONFIG_FAILSLAB # define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 648cbf603736..72162f3a03fa 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -46,7 +46,6 @@ {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ - {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ -- cgit v1.2.3 From 4675ff05de2d76d167336b368bd07f3fef6ed5a6 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:36:02 -0800 Subject: kmemcheck: rip it out Fix up makefiles, remove references, and git rm kmemcheck. Link: http://lkml.kernel.org/r/20171007030159.22241-4-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Steven Rostedt Cc: Vegard Nossum Cc: Pekka Enberg Cc: Michal Hocko Cc: Eric W. Biederman Cc: Alexander Potapenko Cc: Tim Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interrupt.h | 15 ---- include/linux/kmemcheck.h | 171 ---------------------------------------------- 2 files changed, 186 deletions(-) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index baeb872283d9..69c238210325 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -594,21 +594,6 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t) __tasklet_hi_schedule(t); } -extern void __tasklet_hi_schedule_first(struct tasklet_struct *t); - -/* - * This version avoids touching any other tasklets. Needed for kmemcheck - * in order not to take any page faults while enqueueing this tasklet; - * consider VERY carefully whether you really need this or - * tasklet_hi_schedule()... - */ -static inline void tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) - __tasklet_hi_schedule_first(t); -} - - static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 7b1d7bead7d9..ea32a7d3cf1b 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -1,172 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_KMEMCHECK_H -#define LINUX_KMEMCHECK_H - -#include -#include - -#ifdef CONFIG_KMEMCHECK -extern int kmemcheck_enabled; - -/* The slab-related functions. */ -void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node); -void kmemcheck_free_shadow(struct page *page, int order); -void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size); -void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size); - -void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order, - gfp_t gfpflags); - -void kmemcheck_show_pages(struct page *p, unsigned int n); -void kmemcheck_hide_pages(struct page *p, unsigned int n); - -bool kmemcheck_page_is_tracked(struct page *p); - -void kmemcheck_mark_unallocated(void *address, unsigned int n); -void kmemcheck_mark_uninitialized(void *address, unsigned int n); -void kmemcheck_mark_initialized(void *address, unsigned int n); -void kmemcheck_mark_freed(void *address, unsigned int n); - -void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n); -void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n); -void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n); - -int kmemcheck_show_addr(unsigned long address); -int kmemcheck_hide_addr(unsigned long address); - -bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size); - -/* - * Bitfield annotations - * - * How to use: If you have a struct using bitfields, for example - * - * struct a { - * int x:8, y:8; - * }; - * - * then this should be rewritten as - * - * struct a { - * kmemcheck_bitfield_begin(flags); - * int x:8, y:8; - * kmemcheck_bitfield_end(flags); - * }; - * - * Now the "flags_begin" and "flags_end" members may be used to refer to the - * beginning and end, respectively, of the bitfield (and things like - * &x.flags_begin is allowed). As soon as the struct is allocated, the bit- - * fields should be annotated: - * - * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL); - * kmemcheck_annotate_bitfield(a, flags); - */ -#define kmemcheck_bitfield_begin(name) \ - int name##_begin[0]; - -#define kmemcheck_bitfield_end(name) \ - int name##_end[0]; - -#define kmemcheck_annotate_bitfield(ptr, name) \ - do { \ - int _n; \ - \ - if (!ptr) \ - break; \ - \ - _n = (long) &((ptr)->name##_end) \ - - (long) &((ptr)->name##_begin); \ - BUILD_BUG_ON(_n < 0); \ - \ - kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ - } while (0) - -#define kmemcheck_annotate_variable(var) \ - do { \ - kmemcheck_mark_initialized(&(var), sizeof(var)); \ - } while (0) \ - -#else -#define kmemcheck_enabled 0 - -static inline void -kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) -{ -} - -static inline void -kmemcheck_free_shadow(struct page *page, int order) -{ -} - -static inline void -kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size) -{ -} - -static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object, - size_t size) -{ -} - -static inline void kmemcheck_pagealloc_alloc(struct page *p, - unsigned int order, gfp_t gfpflags) -{ -} - -static inline bool kmemcheck_page_is_tracked(struct page *p) -{ - return false; -} - -static inline void kmemcheck_mark_unallocated(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_initialized(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_freed(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_unallocated_pages(struct page *p, - unsigned int n) -{ -} - -static inline void kmemcheck_mark_uninitialized_pages(struct page *p, - unsigned int n) -{ -} - -static inline void kmemcheck_mark_initialized_pages(struct page *p, - unsigned int n) -{ -} - -static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) -{ - return true; -} - -#define kmemcheck_bitfield_begin(name) -#define kmemcheck_bitfield_end(name) -#define kmemcheck_annotate_bitfield(ptr, name) \ - do { \ - } while (0) - -#define kmemcheck_annotate_variable(var) \ - do { \ - } while (0) - -#endif /* CONFIG_KMEMCHECK */ - -#endif /* LINUX_KMEMCHECK_H */ -- cgit v1.2.3 From ea1f5f3712afe895dfa4176ec87376b4a9ac23be Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:27 -0800 Subject: mm: define memblock_virt_alloc_try_nid_raw * A new variant of memblock_virt_alloc_* allocations: memblock_virt_alloc_try_nid_raw() - Does not zero the allocated memory - Does not panic if request cannot be satisfied * optimize early system hash allocations Clients can call alloc_large_system_hash() with flag: HASH_ZERO to specify that memory that was allocated for system hash needs to be zeroed, otherwise the memory does not need to be zeroed, and client will initialize it. If memory does not need to be zero'd, call the new memblock_virt_alloc_raw() interface, and thus improve the boot performance. * debug for raw alloctor When CONFIG_DEBUG_VM is enabled, this patch sets all the memory that is returned by memblock_virt_alloc_try_nid_raw() to ones to ensure that no places excpect zeroed memory. Link: http://lkml.kernel.org/r/20171013173214.27300-6-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Tested-by: Bob Picco Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index fdf40ca04b3c..a53063e9d7d8 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -161,6 +161,9 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ +void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid); void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); @@ -177,6 +180,14 @@ static inline void * __init memblock_virt_alloc( NUMA_NO_NODE); } +static inline void * __init memblock_virt_alloc_raw( + phys_addr_t size, phys_addr_t align) +{ + return memblock_virt_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + static inline void * __init memblock_virt_alloc_nopanic( phys_addr_t size, phys_addr_t align) { @@ -258,6 +269,14 @@ static inline void * __init memblock_virt_alloc( return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT); } +static inline void * __init memblock_virt_alloc_raw( + phys_addr_t size, phys_addr_t align) +{ + if (!align) + align = SMP_CACHE_BYTES; + return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT); +} + static inline void * __init memblock_virt_alloc_nopanic( phys_addr_t size, phys_addr_t align) { @@ -310,6 +329,14 @@ static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size, min_addr); } +static inline void * __init memblock_virt_alloc_try_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, int nid) +{ + return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align, + min_addr, max_addr); +} + static inline void * __init memblock_virt_alloc_try_nid_nopanic( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) -- cgit v1.2.3 From a4a3ede2132ae0863e2d43e06f9b5697c51a7a3b Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:31 -0800 Subject: mm: zero reserved and unavailable struct pages Some memory is reserved but unavailable: not present in memblock.memory (because not backed by physical pages), but present in memblock.reserved. Such memory has backing struct pages, but they are not initialized by going through __init_single_page(). In some cases these struct pages are accessed even if they do not contain any data. One example is page_to_pfn() might access page->flags if this is where section information is stored (CONFIG_SPARSEMEM, SECTION_IN_PAGE_FLAGS). One example of such memory: trim_low_memory_range() unconditionally reserves from pfn 0, but e820__memblock_setup() might provide the exiting memory from pfn 1 (i.e. KVM). Since struct pages are zeroed in __init_single_page(), and not during allocation time, we must zero such struct pages explicitly. The patch involves adding a new memblock iterator: for_each_resv_unavail_range(i, p_start, p_end) Which iterates through reserved && !memory lists, and we zero struct pages explicitly by calling mm_zero_struct_page(). === Here is more detailed example of problem that this patch is addressing: Run tested on qemu with the following arguments: -enable-kvm -cpu kvm64 -m 512 -smp 2 This patch reports that there are 98 unavailable pages. They are: pfn 0 and pfns in range [159, 255]. Note, trim_low_memory_range() reserves only pfns in range [0, 15], it does not reserve [159, 255] ones. e820__memblock_setup() reports linux that the following physical ranges are available: [1 , 158] [256, 130783] Notice, that exactly unavailable pfns are missing! Now, lets check what we have in zone 0: [1, 131039] pfn 0, is not part of the zone, but pfns [1, 158], are. However, the bigger problem we have if we do not initialize these struct pages is with memory hotplug. Because, that path operates at 2M boundaries (section_nr). And checks if 2M range of pages is hot removable. It starts with first pfn from zone, rounds it down to 2M boundary (sturct pages are allocated at 2M boundaries when vmemmap is created), and checks if that section is hot removable. In this case start with pfn 1 and convert it down to pfn 0. Later pfn is converted to struct page, and some fields are checked. Now, if we do not zero struct pages, we get unpredictable results. In fact when CONFIG_VM_DEBUG is enabled, and we explicitly set all vmemmap memory to ones, the following panic is observed with kernel test without this patch applied: BUG: unable to handle kernel NULL pointer dereference at (null) IP: is_pageblock_removable_nolock+0x35/0x90 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT ... task: ffff88001f4e2900 task.stack: ffffc90000314000 RIP: 0010:is_pageblock_removable_nolock+0x35/0x90 Call Trace: ? is_mem_section_removable+0x5a/0xd0 show_mem_removable+0x6b/0xa0 dev_attr_show+0x1b/0x50 sysfs_kf_seq_show+0xa1/0x100 kernfs_seq_show+0x22/0x30 seq_read+0x1ac/0x3a0 kernfs_fop_read+0x36/0x190 ? security_file_permission+0x90/0xb0 __vfs_read+0x16/0x30 vfs_read+0x81/0x130 SyS_read+0x44/0xa0 entry_SYSCALL_64_fastpath+0x1f/0xbd Link: http://lkml.kernel.org/r/20171013173214.27300-7-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Tested-by: Bob Picco Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 16 ++++++++++++++++ include/linux/mm.h | 15 +++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ce0e5634c2f9..7ed0f7782d16 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -237,6 +237,22 @@ unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) +/** + * for_each_resv_unavail_range - iterate through reserved and unavailable memory + * @i: u64 used as loop variable + * @flags: pick from blocks based on memory attributes + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over unavailable but reserved (reserved && !memory) areas of memblock. + * Available as soon as memblock is initialized. + * Note: because this memory does not belong to any physical node, flags and + * nid arguments do not make sense and thus not exported as arguments. + */ +#define for_each_resv_unavail_range(i, p_start, p_end) \ + for_each_mem_range(i, &memblock.reserved, &memblock.memory, \ + NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL) + static inline void memblock_set_region_flags(struct memblock_region *r, unsigned long flags) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c1e82a1aa77..703599fa8288 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -95,6 +95,15 @@ extern int mmap_rnd_compat_bits __read_mostly; #define mm_forbids_zeropage(X) (0) #endif +/* + * On some architectures it is expensive to call memset() for small sizes. + * Those architectures should provide their own implementation of "struct page" + * zeroing by defining this macro in . + */ +#ifndef mm_zero_struct_page +#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) +#endif + /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. Users can overwrite this number by sysctl but there is a @@ -2030,6 +2039,12 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state); #endif +#ifdef CONFIG_HAVE_MEMBLOCK +void zero_resv_unavail(void); +#else +static inline void zero_resv_unavail(void) {} +#endif + extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); -- cgit v1.2.3 From 736304f3245f39392895ff3392e1325d3e49e7d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:11 -0800 Subject: mm: speed up cancel_dirty_page() for clean pages Patch series "Speed up page cache truncation", v1. When rebasing our enterprise distro to a newer kernel (from 4.4 to 4.12) we have noticed a regression in bonnie++ benchmark when deleting files. Eventually we have tracked this down to a fact that page cache truncation got slower by about 10%. There were both gains and losses in the above interval of kernels but we have been able to identify that commit 83929372f629 ("filemap: prepare find and delete operations for huge pages") caused about 10% regression on its own. After some investigation it didn't seem easily possible to fix the regression while maintaining the THP in page cache functionality so we've decided to optimize the page cache truncation path instead to make up for the change. This series is a result of that effort. Patch 1 is an easy speedup of cancel_dirty_page(). Patches 2-6 refactor page cache truncation code so that it is easier to batch radix tree operations. Patch 7 implements batching of deletes from the radix tree which more than makes up for the original regression. This patch (of 7): cancel_dirty_page() does quite some work even for clean pages (fetching of mapping, locking of memcg, atomic bit op on page flags) so it accounts for ~2.5% of cost of truncation of a clean page. That is not much but still dumb for something we don't need at all. Check whether a page is actually dirty and avoid any work if not. Link: http://lkml.kernel.org/r/20171010151937.26984-2-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Hansen Cc: Dave Chinner Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 703599fa8288..c7b1d617dff6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1440,7 +1440,13 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -void cancel_dirty_page(struct page *page); +void __cancel_dirty_page(struct page *page); +static inline void cancel_dirty_page(struct page *page) +{ + /* Avoid atomic ops, locking, etc. when not actually needed. */ + if (PageDirty(page)) + __cancel_dirty_page(page); +} int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); -- cgit v1.2.3 From aa65c29ce1b6e1990cd2c7d8004bbea7ff3aff38 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:33 -0800 Subject: mm: batch radix tree operations when truncating pages Currently we remove pages from the radix tree one by one. To speed up page cache truncation, lock several pages at once and free them in one go. This allows us to batch radix tree operations in a more efficient way and also save round-trips on mapping->tree_lock. As a result we gain about 20% speed improvement in page cache truncation. Data from a simple benchmark timing 10000 truncates of 1024 pages (on ext4 on ramdisk but the filesystem is barely visible in the profiles). The range shows 1% and 95% percentiles of the measured times: 4.14-rc2 4.14-rc2 + batched truncation 248-256 209-219 249-258 209-217 248-255 211-239 248-255 209-217 247-256 210-218 [jack@suse.cz: convert delete_from_page_cache_batch() to pagevec] Link: http://lkml.kernel.org/r/20171018111648.13714-1-jack@suse.cz [akpm@linux-foundation.org: move struct pagevec forward declaration to top-of-file] Link: http://lkml.kernel.org/r/20171010151937.26984-8-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 956d6a80e126..e0f7181118fe 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -16,6 +16,8 @@ #include /* for in_interrupt() */ #include +struct pagevec; + /* * Bits in mapping->flags. */ @@ -624,6 +626,8 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); +void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: -- cgit v1.2.3 From c7df8ad2910e965a6241b6d8f52fd122e26b0315 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:41 -0800 Subject: mm, truncate: do not check mapping for every page being truncated During truncation, the mapping has already been checked for shmem and dax so it's known that workingset_update_node is required. This patch avoids the checks on mapping for each page being truncated. In all other cases, a lookup helper is used to determine if workingset_update_node() needs to be called. The one danger is that the API is slightly harder to use as calling workingset_update_node directly without checking for dax or shmem mappings could lead to surprises. However, the API rarely needs to be used and hopefully the comment is enough to give people the hint. sparsetruncate (tiny) 4.14.0-rc4 4.14.0-rc4 oneirq-v1r1 pickhelper-v1r1 Min Time 141.00 ( 0.00%) 140.00 ( 0.71%) 1st-qrtle Time 142.00 ( 0.00%) 141.00 ( 0.70%) 2nd-qrtle Time 142.00 ( 0.00%) 142.00 ( 0.00%) 3rd-qrtle Time 143.00 ( 0.00%) 143.00 ( 0.00%) Max-90% Time 144.00 ( 0.00%) 144.00 ( 0.00%) Max-95% Time 147.00 ( 0.00%) 145.00 ( 1.36%) Max-99% Time 195.00 ( 0.00%) 191.00 ( 2.05%) Max Time 230.00 ( 0.00%) 205.00 ( 10.87%) Amean Time 144.37 ( 0.00%) 143.82 ( 0.38%) Stddev Time 10.44 ( 0.00%) 9.00 ( 13.74%) Coeff Time 7.23 ( 0.00%) 6.26 ( 13.41%) Best99%Amean Time 143.72 ( 0.00%) 143.34 ( 0.26%) Best95%Amean Time 142.37 ( 0.00%) 142.00 ( 0.26%) Best90%Amean Time 142.19 ( 0.00%) 141.85 ( 0.24%) Best75%Amean Time 141.92 ( 0.00%) 141.58 ( 0.24%) Best50%Amean Time 141.69 ( 0.00%) 141.31 ( 0.27%) Best25%Amean Time 141.38 ( 0.00%) 140.97 ( 0.29%) As you'd expect, the gain is marginal but it can be detected. The differences in bonnie are all within the noise which is not surprising given the impact on the microbenchmark. radix_tree_update_node_t is a callback for some radix operations that optionally passes in a private field. The only user of the callback is workingset_update_node and as it no longer requires a mapping, the private field is removed. Link: http://lkml.kernel.org/r/20171018075952.10627-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 7 +++---- include/linux/swap.h | 13 ++++++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 567ebb5eaab0..0ca448c1cb42 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -301,18 +301,17 @@ void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index, void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *, unsigned long index); -typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *); +typedef void (*radix_tree_update_node_t)(struct radix_tree_node *); void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *, void __rcu **slot, void *entry, - radix_tree_update_node_t update_node, void *private); + radix_tree_update_node_t update_node); void radix_tree_iter_replace(struct radix_tree_root *, const struct radix_tree_iter *, void __rcu **slot, void *entry); void radix_tree_replace_slot(struct radix_tree_root *, void __rcu **slot, void *entry); void __radix_tree_delete_node(struct radix_tree_root *, struct radix_tree_node *, - radix_tree_update_node_t update_node, - void *private); + radix_tree_update_node_t update_node); void radix_tree_iter_delete(struct radix_tree_root *, struct radix_tree_iter *iter, void __rcu **slot); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/include/linux/swap.h b/include/linux/swap.h index 8b8a6f965785..454f042bcdd5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -298,7 +298,18 @@ struct vma_swap_readahead { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -void workingset_update_node(struct radix_tree_node *node, void *private); + +/* Do not use directly, use workingset_lookup_update */ +void workingset_update_node(struct radix_tree_node *node); + +/* Returns workingset_update_node() if the mapping has shadow entries. */ +#define workingset_lookup_update(mapping) \ +({ \ + radix_tree_update_node_t __helper = workingset_update_node; \ + if (dax_mapping(mapping) || shmem_mapping(mapping)) \ + __helper = NULL; \ + __helper; \ +}) /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; -- cgit v1.2.3 From d9ed0d08b6c6a882da1d8e75bb3162fc889fd199 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:48 -0800 Subject: mm: only drain per-cpu pagevecs once per pagevec usage When a pagevec is initialised on the stack, it is generally used multiple times over a range of pages, looking up entries and then releasing them. On each pagevec_release, the per-cpu deferred LRU pagevecs are drained on the grounds the page being released may be on those queues and the pages may be cache hot. In many cases only the first drain is necessary as it's unlikely that the range of pages being walked is racing against LRU addition. Even if there is such a race, the impact is marginal where as constantly redraining the lru pagevecs costs. This patch ensures that pagevec is only drained once in a given lifecycle without increasing the cache footprint of the pagevec structure. Only sparsetruncate tiny is shown here as large files have many exceptional entries and calls pagecache_release less frequently. sparsetruncate (tiny) 4.14.0-rc4 4.14.0-rc4 batchshadow-v1r1 onedrain-v1r1 Min Time 141.00 ( 0.00%) 141.00 ( 0.00%) 1st-qrtle Time 142.00 ( 0.00%) 142.00 ( 0.00%) 2nd-qrtle Time 142.00 ( 0.00%) 142.00 ( 0.00%) 3rd-qrtle Time 143.00 ( 0.00%) 143.00 ( 0.00%) Max-90% Time 144.00 ( 0.00%) 144.00 ( 0.00%) Max-95% Time 146.00 ( 0.00%) 145.00 ( 0.68%) Max-99% Time 198.00 ( 0.00%) 194.00 ( 2.02%) Max Time 254.00 ( 0.00%) 208.00 ( 18.11%) Amean Time 145.12 ( 0.00%) 144.30 ( 0.56%) Stddev Time 12.74 ( 0.00%) 9.62 ( 24.49%) Coeff Time 8.78 ( 0.00%) 6.67 ( 24.06%) Best99%Amean Time 144.29 ( 0.00%) 143.82 ( 0.32%) Best95%Amean Time 142.68 ( 0.00%) 142.31 ( 0.26%) Best90%Amean Time 142.52 ( 0.00%) 142.19 ( 0.24%) Best75%Amean Time 142.26 ( 0.00%) 141.98 ( 0.20%) Best50%Amean Time 141.90 ( 0.00%) 141.71 ( 0.13%) Best25%Amean Time 141.80 ( 0.00%) 141.43 ( 0.26%) The impact on bonnie is marginal and within the noise because a significant percentage of the file being truncated has been reclaimed and consists of shadow entries which reduce the hotness of the pagevec_release path. Link: http://lkml.kernel.org/r/20171018075952.10627-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 4f56e0ad9d00..95c75c858d1f 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -17,7 +17,8 @@ struct address_space; struct pagevec { unsigned long nr; - unsigned long cold; + bool cold; + bool drained; struct page *pages[PAGEVEC_SIZE]; }; @@ -54,6 +55,7 @@ static inline void pagevec_init(struct pagevec *pvec, int cold) { pvec->nr = 0; pvec->cold = cold; + pvec->drained = false; } static inline void pagevec_reinit(struct pagevec *pvec) -- cgit v1.2.3 From 8667982014d6048e0b5e286b6247ff24f48d4cc6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:52 -0800 Subject: mm, pagevec: remove cold parameter for pagevecs Every pagevec_init user claims the pages being released are hot even in cases where it is unlikely the pages are hot. As no one cares about the hotness of pages being released to the allocator, just ditch the parameter. No performance impact is expected as the overhead is marginal. The parameter is removed simply because it is a bit stupid to have a useless parameter copied everywhere. Link: http://lkml.kernel.org/r/20171018075952.10627-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 95c75c858d1f..eebefd209424 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -17,7 +17,6 @@ struct address_space; struct pagevec { unsigned long nr; - bool cold; bool drained; struct page *pages[PAGEVEC_SIZE]; }; @@ -51,10 +50,9 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); } -static inline void pagevec_init(struct pagevec *pvec, int cold) +static inline void pagevec_init(struct pagevec *pvec) { pvec->nr = 0; - pvec->cold = cold; pvec->drained = false; } -- cgit v1.2.3 From c6f92f9fbe7dbcc8903a67229aa88b4077ae4422 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:55 -0800 Subject: mm: remove cold parameter for release_pages All callers of release_pages claim the pages being released are cache hot. As no one cares about the hotness of pages being released to the allocator, just ditch the parameter. No performance impact is expected as the overhead is marginal. The parameter is removed simply because it is a bit stupid to have a useless parameter copied everywhere. Link: http://lkml.kernel.org/r/20171018075952.10627-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 +- include/linux/swap.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e0f7181118fe..4c6790bb7afb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -118,7 +118,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } -void release_pages(struct page **pages, int nr, bool cold); +void release_pages(struct page **pages, int nr); /* * speculatively take a reference to a page. diff --git a/include/linux/swap.h b/include/linux/swap.h index 454f042bcdd5..c2b8128799c1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,7 +510,7 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) #define free_page_and_swap_cache(page) \ put_page(page) #define free_pages_and_swap_cache(pages, nr) \ - release_pages((pages), (nr), false); + release_pages((pages), (nr)); static inline void show_swap_cache_info(void) { -- cgit v1.2.3 From 2d4894b5d2ae0fe1725ea7abd57b33bfbbe45492 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:59 -0800 Subject: mm: remove cold parameter from free_hot_cold_page* Most callers users of free_hot_cold_page claim the pages being released are cache hot. The exception is the page reclaim paths where it is likely that enough pages will be freed in the near future that the per-cpu lists are going to be recycled and the cache hotness information is lost. As no one really cares about the hotness of pages being released to the allocator, just ditch the parameter. The APIs are renamed to indicate that it's no longer about hot/cold pages. It should also be less confusing as there are subtle differences between them. __free_pages drops a reference and frees a page when the refcount reaches zero. free_hot_cold_page handled pages whose refcount was already zero which is non-obvious from the name. free_unref_page should be more obvious. No performance impact is expected as the overhead is marginal. The parameter is removed simply because it is a bit stupid to have a useless parameter copied everywhere. [mgorman@techsingularity.net: add pages to head, not tail] Link: http://lkml.kernel.org/r/20171019154321.qtpzaeftoyyw4iey@techsingularity.net Link: http://lkml.kernel.org/r/20171018075952.10627-8-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++-- include/trace/events/kmem.h | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b041f94678de..f7e62d9096fe 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -530,8 +530,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); -extern void free_hot_cold_page(struct page *page, bool cold); -extern void free_hot_cold_page_list(struct list_head *list, bool cold); +extern void free_unref_page(struct page *page); +extern void free_unref_page_list(struct list_head *list); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 285feeadac39..eb57e3037deb 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -172,24 +172,21 @@ TRACE_EVENT(mm_page_free, TRACE_EVENT(mm_page_free_batched, - TP_PROTO(struct page *page, int cold), + TP_PROTO(struct page *page), - TP_ARGS(page, cold), + TP_ARGS(page), TP_STRUCT__entry( __field( unsigned long, pfn ) - __field( int, cold ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->cold = cold; ), - TP_printk("page=%p pfn=%lu order=0 cold=%d", + TP_printk("page=%p pfn=%lu order=0", pfn_to_page(__entry->pfn), - __entry->pfn, - __entry->cold) + __entry->pfn) ); TRACE_EVENT(mm_page_alloc, -- cgit v1.2.3 From 453f85d43fa9ee243f0fc3ac4e1be45615301e3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:38:03 -0800 Subject: mm: remove __GFP_COLD As the page free path makes no distinction between cache hot and cold pages, there is no real useful ordering of pages in the free list that allocation requests can take advantage of. Juding from the users of __GFP_COLD, it is likely that a number of them are the result of copying other sites instead of actually measuring the impact. Remove the __GFP_COLD parameter which simplifies a number of paths in the page allocator. This is potentially controversial but bear in mind that the size of the per-cpu pagelists versus modern cache sizes means that the whole per-cpu list can often fit in the L3 cache. Hence, there is only a potential benefit for microbenchmarks that alloc/free pages in a tight loop. It's even worse when THP is taken into account which has little or no chance of getting a cache-hot page as the per-cpu list is bypassed and the zeroing of multiple pages will thrash the cache anyway. The truncate microbenchmarks are not shown as this patch affects the allocation path and not the free path. A page fault microbenchmark was tested but it showed no sigificant difference which is not surprising given that the __GFP_COLD branches are a miniscule percentage of the fault path. Link: http://lkml.kernel.org/r/20171018075952.10627-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 ----- include/linux/pagemap.h | 8 +------- include/linux/skbuff.h | 2 +- include/linux/slab.h | 3 --- include/trace/events/mmflags.h | 1 - 5 files changed, 2 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f7e62d9096fe..1a4582b44d32 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -24,7 +24,6 @@ struct vm_area_struct; #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u -#define ___GFP_COLD 0x100u #define ___GFP_NOWARN 0x200u #define ___GFP_RETRY_MAYFAIL 0x400u #define ___GFP_NOFAIL 0x800u @@ -192,16 +191,12 @@ struct vm_area_struct; /* * Action modifiers * - * __GFP_COLD indicates that the caller does not expect to be used in the near - * future. Where possible, a cache-cold page will be returned. - * * __GFP_NOWARN suppresses allocation failure reports. * * __GFP_COMP address compound page metadata. * * __GFP_ZERO returns a zeroed page on success. */ -#define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4c6790bb7afb..34ce3ebf97d5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -234,15 +234,9 @@ static inline struct page *page_cache_alloc(struct address_space *x) return __page_cache_alloc(mapping_gfp_mask(x)); } -static inline struct page *page_cache_alloc_cold(struct address_space *x) -{ - return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); -} - static inline gfp_t readahead_gfp_mask(struct address_space *x) { - return mapping_gfp_mask(x) | - __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN; + return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(void *, struct page *); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index aa1341474916..7c46fd0b8b64 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2672,7 +2672,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to * code in gfp_to_alloc_flags that should be enforcing this. */ - gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC; + gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); } diff --git a/include/linux/slab.h b/include/linux/slab.h index 79f6532f8a0b..50697a1d6621 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -467,9 +467,6 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * Also it is possible to set different flags by OR'ing * in one or more of the following additional @flags: * - * %__GFP_COLD - Request cache-cold pages instead of - * trying to return cache-warm pages. - * * %__GFP_HIGH - This allocation has high priority and may use emergency pools. * * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 72162f3a03fa..dbe1bb058c09 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -32,7 +32,6 @@ {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ {(unsigned long)__GFP_IO, "__GFP_IO"}, \ {(unsigned long)__GFP_FS, "__GFP_FS"}, \ - {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ {(unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \ {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ -- cgit v1.2.3 From 7f0b5fb953e750a7410cc96c67a656d79db48bcb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:38:10 -0800 Subject: mm, pagevec: rename pagevec drained field According to Vlastimil Babka, the drained field in pagevec is potentially misleading because it might be interpreted as draining this pagevec instead of the percpu lru pagevecs. Rename the field for clarity. Link: http://lkml.kernel.org/r/20171019093346.ylahzdpzmoriyf4v@techsingularity.net Signed-off-by: Mel Gorman Suggested-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index eebefd209424..5fb6580f7f23 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -17,7 +17,7 @@ struct address_space; struct pagevec { unsigned long nr; - bool drained; + bool percpu_pvec_drained; struct page *pages[PAGEVEC_SIZE]; }; @@ -53,7 +53,7 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, static inline void pagevec_init(struct pagevec *pvec) { pvec->nr = 0; - pvec->drained = false; + pvec->percpu_pvec_drained = false; } static inline void pagevec_reinit(struct pagevec *pvec) -- cgit v1.2.3 From 4518085e127dff97e74f74a8780d7564e273bec8 Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Wed, 15 Nov 2017 17:38:22 -0800 Subject: mm, sysctl: make NUMA stats configurable This is the second step which introduces a tunable interface that allow numa stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. ========================================================================= When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo 0 > /proc/sys/vm/numa_stat In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer (increase loop times to 10000000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench ========================================================================= When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo 1 > /proc/sys/vm/numa_stat This is system default setting. Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka for comments to help improve the original patch. [keescook@chromium.org: make sure mutex is a global static] Link: http://lkml.kernel.org/r/20171107213809.GA4314@beast Link: http://lkml.kernel.org/r/1508290927-8518-1-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Signed-off-by: Kees Cook Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Luis R . Rodriguez" Cc: Kees Cook Cc: Jonathan Corbet Cc: Mel Gorman Cc: Johannes Weiner Cc: Christopher Lameter Cc: Sebastian Andrzej Siewior Cc: Andrey Ryabinin Cc: Tim Chen Cc: Andi Kleen Cc: Aaron Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1e0cb72e0598..1779c9817b39 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -7,9 +7,19 @@ #include #include #include +#include extern int sysctl_stat_interval; +#ifdef CONFIG_NUMA +#define ENABLE_NUMA_STAT 1 +#define DISABLE_NUMA_STAT 0 +extern int sysctl_vm_numa_stat; +DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); +extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. -- cgit v1.2.3 From d135e5750205a21a212a19dbb05aeb339e2cbea7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:38:41 -0800 Subject: mm/page_alloc.c: broken deferred calculation In reset_deferred_meminit() we determine number of pages that must not be deferred. We initialize pages for at least 2G of memory, but also pages for reserved memory in this node. The reserved memory is determined in this function: memblock_reserved_memory_within(), which operates over physical addresses, and returns size in bytes. However, reset_deferred_meminit() assumes that that this function operates with pfns, and returns page count. The result is that in the best case machine boots slower than expected due to initializing more pages than needed in single thread, and in the worst case panics because fewer than needed pages are initialized early. Link: http://lkml.kernel.org/r/20171021011707.15191-1-pasha.tatashin@oracle.com Fixes: 864b9a393dcb ("mm: consider memblock reservations for deferred memory initialization sizing") Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39ccaa9c428b..67f2e3c38939 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -700,7 +700,8 @@ typedef struct pglist_data { * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; - unsigned long static_init_size; + /* Number of non-deferred pages */ + unsigned long static_init_pgcnt; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.2.3 From 2bce774e8245e95db81872ec39522cde8b486fc8 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Wed, 15 Nov 2017 17:39:03 -0800 Subject: writeback: remove unused function parameter The parameter `struct bdi_writeback *wb` is not been used in the function body. Remove it. Link: http://lkml.kernel.org/r/1509685485-15278-1-git-send-email-wanglong19@meituan.com Signed-off-by: Wang Long Reviewed-by: Jan Kara Acked-by: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 7360e79e9a2f..e54e7e0033eb 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -93,7 +93,7 @@ extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ -static inline unsigned long wb_stat_error(struct bdi_writeback *wb) +static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; -- cgit v1.2.3 From 0205f75571e3a70c35f0dd5e608773cce97d9dbb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:39:14 -0800 Subject: mm: simplify nodemask printing alloc_warn() and dump_header() have to explicitly handle NULL nodemask which forces both paths to use pr_cont. We can do better. printk already handles NULL pointers properly so all we need is to teach nodemask_pr_args to handle NULL nodemask carefully. This allows simplification of both alloc_warn() and dump_header() and gets rid of pr_cont altogether. This patch has been motivated by patch from Joe Perches http://lkml.kernel.org/r/b31236dfe3fc924054fd7842bde678e71d193638.1509991345.git.joe@perches.com [akpm@linux-foundation.org: fix tile warning, per Arnd] Link: http://lkml.kernel.org/r/20171109100531.3cn2hcqnuj7mjaju@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Joe Perches Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index de1c50b93c61..15cab3967d6d 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -104,7 +104,9 @@ extern nodemask_t _unused_nodemask_arg_; * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ -#define nodemask_pr_args(maskp) MAX_NUMNODES, (maskp)->bits +#define nodemask_pr_args(maskp) \ + ((maskp) != NULL) ? MAX_NUMNODES : 0, \ + ((maskp) != NULL) ? (maskp)->bits : NULL /* * The inline keyword gives the compiler room to decide to inline, or -- cgit v1.2.3