From ac79f78dab892fcdc11fda8af5cc5e80d09dca8a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Sep 2019 12:54:18 -0700 Subject: Revert "Revert "mm, thp: restore node-local hugepage allocations"" This reverts commit a8282608c88e08b1782141026eab61204c1e533f. The commit references the original intended semantic for MADV_HUGEPAGE which has subsequently taken on three unique purposes: - enables or disables thp for a range of memory depending on the system's config (is thp "enabled" set to "always" or "madvise"), - determines the synchronous compaction behavior for thp allocations at fault (is thp "defrag" set to "always", "defer+madvise", or "madvise"), and - reverts a previous MADV_NOHUGEPAGE (there is no madvise mode to only clear previous hugepage advice). These are the three purposes that currently exist in 5.2 and over the past several years that userspace has been written around. Adding a NUMA locality preference adds a fourth dimension to an already conflated advice mode. Based on the semantic that MADV_HUGEPAGE has provided over the past several years, there exist workloads that use the tunable based on these principles: specifically that the allocation should attempt to defragment a local node before falling back. It is agreed that remote hugepages typically (but not always) have a better access latency than remote native pages, although on Naples this is at parity for intersocket. The revert commit that this patch reverts allows hugepage allocation to immediately allocate remotely when local memory is fragmented. This is contrary to the semantic of MADV_HUGEPAGE over the past several years: that is, memory compaction should be attempted locally before falling back. The performance degradation of remote hugepages over local hugepages on Rome, for example, is 53.5% increased access latency. For this reason, the goal is to revert back to the 5.2 and previous behavior that would attempt local defragmentation before falling back. With the patch that is reverted by this patch, we see performance degradations at the tail because the allocator happily allocates the remote hugepage rather than even attempting to make a local hugepage available. zone_reclaim_mode is not a solution to this problem since it does not only impact hugepage allocations but rather changes the memory allocation strategy for *all* page allocations. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index bac395f1d00a..5228c62af416 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -139,8 +139,6 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); -struct mempolicy *get_vma_policy(struct vm_area_struct *vma, - unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); -- cgit v1.2.3 From 19deb7695e072deaff025e03de40c61b525bd57e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Sep 2019 12:54:20 -0700 Subject: Revert "Revert "Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"" This reverts commit 92717d429b38e4f9f934eed7e605cc42858f1839. Since commit a8282608c88e ("Revert "mm, thp: restore node-local hugepage allocations"") is reverted in this series, it is better to restore the previous 5.2 behavior between the thp allocation and the page allocator rather than to attempt any consolidation or cleanup for a policy that is now reverted. It's less risky during an rc cycle and subsequent patches in this series further modify the same policy that the pre-5.3 behavior implements. Consolidation and cleanup can be done subsequent to a sane default page allocation strategy, so this patch reverts a cleanup done on a strategy that is now reverted and thus is the least risky option. Signed-off-by: David Rientjes Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f33881688f42..fb07b503dc45 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -510,18 +510,22 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node); + int node, bool hugepage); +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ + alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ + alloc_pages(gfp_mask, order) +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node) + alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); -- cgit v1.2.3