From ab77dab46210bb630e06c6803c5d84074bacd351 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:04:29 -0700 Subject: fs/dax.c: use new return type vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. commit 1c8f422059ae ("mm: change return type to vm_fault_t") There was an existing bug inside dax_load_hole() if vm_insert_mixed had failed to allocate a page table, we'd return VM_FAULT_NOPAGE instead of VM_FAULT_OOM. With new vmf_insert_mixed() this issue is addressed. vm_insert_mixed_mkwrite has inefficiency when it returns an error value, driver has to convert it to vm_fault_t type. With new vmf_insert_mixed_mkwrite() this limitation will be addressed. Link: http://lkml.kernel.org/r/20180510181121.GA15239@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Jan Kara Reviewed-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Alexander Viro Cc: Dan Williams Cc: Michal Hocko Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 4 ++-- include/linux/mm.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index c99692ddd4b5..88504e87cd6c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -125,8 +125,8 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, int *errp, const struct iomap_ops *ops); -int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - pfn_t pfn); +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0495e6f97fae..b7012bb1d218 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2431,8 +2431,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn); +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, -- cgit v1.2.3 From 05fec35ebb0e0e1f4603cc241fdf65fd3abb3092 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 7 Jun 2018 17:05:24 -0700 Subject: slab: clean up the code comment in slab kmem_cache struct In commit 3b0efdfa1e7 ("mm, sl[aou]b: Extract common fields from struct kmem_cache") the variable 'obj_size' was moved above, however the related code comment is not updated accordingly. Do it here. Link: http://lkml.kernel.org/r/20180603032402.27526-1-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Andrew Morton Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab_def.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index d9228e4d0320..3485c58cfd1c 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -67,9 +67,10 @@ struct kmem_cache { /* * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. size contains the total - * object size including these internal fields, the following two - * variables contain the offset to the user object and its size. + * fields and/or padding to every object. 'size' contains the total + * object size including these internal fields, while 'obj_offset' + * and 'object_size' contain the offset to the user object and its + * size. */ int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ -- cgit v1.2.3 From 88aa7cc688d48ddd84558b41d5905a0db9535c4b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 7 Jun 2018 17:05:28 -0700 Subject: mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct mmap_sem is on the hot path of kernel, and it very contended, but it is abused too. It is used to protect arg_start|end and evn_start|end when reading /proc/$PID/cmdline and /proc/$PID/environ, but it doesn't make sense since those proc files just expect to read 4 values atomically and not related to VM, they could be set to arbitrary values by C/R. And, the mmap_sem contention may cause unexpected issue like below: INFO: task ps:14018 blocked for more than 120 seconds. Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. ps D 0 14018 1 0x00000004 Call Trace: schedule+0x36/0x80 rwsem_down_read_failed+0xf0/0x150 call_rwsem_down_read_failed+0x18/0x30 down_read+0x20/0x40 proc_pid_cmdline_read+0xd9/0x4e0 __vfs_read+0x37/0x150 vfs_read+0x96/0x130 SyS_read+0x55/0xc0 entry_SYSCALL_64_fastpath+0x1a/0xc5 Both Alexey Dobriyan and Michal Hocko suggested to use dedicated lock for them to mitigate the abuse of mmap_sem. So, introduce a new spinlock in mm_struct to protect the concurrent access to arg_start|end, env_start|end and others, as well as replace write map_sem to read to protect the race condition between prctl and sys_brk which might break check_data_rlimit(), and makes prctl more friendly to other VM operations. This patch just eliminates the abuse of mmap_sem, but it can't resolve the above hung task warning completely since the later access_remote_vm() call needs acquire mmap_sem. The mmap_sem scalability issue will be solved in the future. [yang.shi@linux.alibaba.com: add comment about mmap_sem and arg_lock] Link: http://lkml.kernel.org/r/1524077799-80690-1-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1523730291-109696-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: Cyrill Gorcunov Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Matthew Wilcox Cc: Mateusz Guzik Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 21612347d311..49dd59ea90f7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -413,6 +413,8 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ unsigned long def_flags; + + spinlock_t arg_lock; /* protect the below fields */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; -- cgit v1.2.3 From f3a53a3a1e5b3b9bc114b5b7930fbe89d9ffed5d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Jun 2018 17:05:35 -0700 Subject: mm, memcontrol: implement memory.swap.events Add swap max and fail events so that userland can monitor and respond to running out of swap. I'm not too sure about the fail event. Right now, it's a bit confusing which stats / events are recursive and which aren't and also which ones reflect events which originate from a given cgroup and which targets the cgroup. No idea what the right long term solution is and it could just be that growing them organically is actually the only right thing to do. Link: http://lkml.kernel.org/r/20180416231151.GI1911913@devbig577.frc2.facebook.com Signed-off-by: Tejun Heo Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d99b71bc2c66..517096c3cc99 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -53,6 +53,8 @@ enum memcg_memory_event { MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, + MEMCG_SWAP_MAX, + MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, }; @@ -208,6 +210,9 @@ struct mem_cgroup { atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; + /* handle for "memory.swap.events" */ + struct cgroup_file swap_events_file; + /* protect arrays of thresholds */ struct mutex thresholds_lock; -- cgit v1.2.3 From 5d752600a8c373382264392f5b573b2fc9c0e8ea Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 7 Jun 2018 17:06:01 -0700 Subject: mm: restructure memfd code With the addition of memfd hugetlbfs support, we now have the situation where memfd depends on TMPFS -or- HUGETLBFS. Previously, memfd was only supported on tmpfs, so it made sense that the code resided in shmem.c. In the current code, memfd is only functional if TMPFS is defined. If HUGETLFS is defined and TMPFS is not defined, then memfd functionality will not be available for hugetlbfs. This does not cause BUGs, just a lack of potentially desired functionality. Code is restructured in the following way: - include/linux/memfd.h is a new file containing memfd specific definitions previously contained in shmem_fs.h. - mm/memfd.c is a new file containing memfd specific code previously contained in shmem.c. - memfd specific code is removed from shmem_fs.h and shmem.c. - A new config option MEMFD_CREATE is added that is defined if TMPFS or HUGETLBFS is defined. No functional changes are made to the code: restructuring only. Link: http://lkml.kernel.org/r/20180415182119.4517-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Khalid Aziz Cc: Andrea Arcangeli Cc: David Herrmann Cc: Hugh Dickins Cc: Marc-Andr Lureau Cc: Matthew Wilcox Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memfd.h | 16 ++++++++++++++++ include/linux/shmem_fs.h | 13 ------------- 2 files changed, 16 insertions(+), 13 deletions(-) create mode 100644 include/linux/memfd.h (limited to 'include/linux') diff --git a/include/linux/memfd.h b/include/linux/memfd.h new file mode 100644 index 000000000000..4f1600413f91 --- /dev/null +++ b/include/linux/memfd.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_MEMFD_H +#define __LINUX_MEMFD_H + +#include + +#ifdef CONFIG_MEMFD_CREATE +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +#else +static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) +{ + return -EINVAL; +} +#endif + +#endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 73b5e655a76e..f155dc607112 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -110,19 +110,6 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); -#ifdef CONFIG_TMPFS - -extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); - -#else - -static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) -{ - return -EINVAL; -} - -#endif - #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE extern bool shmem_huge_enabled(struct vm_area_struct *vma); #else -- cgit v1.2.3 From 3010a5ea665a089361e435093bd737399123fcc4 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 7 Jun 2018 17:06:08 -0700 Subject: mm: introduce ARCH_HAS_PTE_SPECIAL Currently the PTE special supports is turned on in per architecture header files. Most of the time, it is defined in arch/*/include/asm/pgtable.h depending or not on some other per architecture static definition. This patch introduce a new configuration variable to manage this directly in the Kconfig files. It would later replace __HAVE_ARCH_PTE_SPECIAL. Here notes for some architecture where the definition of __HAVE_ARCH_PTE_SPECIAL is not obvious: arm __HAVE_ARCH_PTE_SPECIAL which is currently defined in arch/arm/include/asm/pgtable-3level.h which is included by arch/arm/include/asm/pgtable.h when CONFIG_ARM_LPAE is set. So select ARCH_HAS_PTE_SPECIAL if ARM_LPAE. powerpc __HAVE_ARCH_PTE_SPECIAL is defined in 2 files: - arch/powerpc/include/asm/book3s/64/pgtable.h - arch/powerpc/include/asm/pte-common.h The first one is included if (PPC_BOOK3S & PPC64) while the second is included in all the other cases. So select ARCH_HAS_PTE_SPECIAL all the time. sparc: __HAVE_ARCH_PTE_SPECIAL is defined if defined(__sparc__) && defined(__arch64__) which are defined through the compiler in sparc/Makefile if !SPARC32 which I assume to be if SPARC64. So select ARCH_HAS_PTE_SPECIAL if SPARC64 There is no functional change introduced by this patch. Link: http://lkml.kernel.org/r/1523433816-14460-2-git-send-email-ldufour@linux.vnet.ibm.com Signed-off-by: Laurent Dufour Suggested-by: Jerome Glisse Reviewed-by: Jerome Glisse Acked-by: David Rientjes Cc: Michal Hocko Cc: "Aneesh Kumar K . V" Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Will Deacon Cc: Yoshinori Sato Cc: Rich Felker Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Vineet Gupta Cc: Palmer Dabbelt Cc: Albert Ou Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Rientjes Cc: Robin Murphy Cc: Christophe LEROY Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn_t.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index a03c2642a87c..21713dc14ce2 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -122,7 +122,7 @@ pud_t pud_mkdevmap(pud_t pud); #endif #endif /* __HAVE_ARCH_PTE_DEVMAP */ -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static inline bool pfn_t_special(pfn_t pfn) { return (pfn.val & PFN_SPECIAL) == PFN_SPECIAL; @@ -132,5 +132,5 @@ static inline bool pfn_t_special(pfn_t pfn) { return false; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #endif /* _LINUX_PFN_T_H_ */ -- cgit v1.2.3 From 1c4bc43ddfd52cbe5a08bb86ae636f55d2799424 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Thu, 7 Jun 2018 17:06:15 -0700 Subject: mm/memblock: introduce PHYS_ADDR_MAX So far code was using ULLONG_MAX and type casting to obtain a phys_addr_t with all bits set. The typecast is necessary to silence compiler warnings on 32-bit platforms. Use the simpler but still type safe approach "~(phys_addr_t)0" to create a preprocessor define for all bits set. Link: http://lkml.kernel.org/r/20180406213809.566-1-stefan@agner.ch Signed-off-by: Stefan Agner Suggested-by: Linus Torvalds Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Catalin Marinas Cc: Pavel Tatashin Cc: Ard Biesheuvel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7aed92624531..7c4e8f1f72d8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -29,6 +29,7 @@ #define LLONG_MIN (-LLONG_MAX - 1) #define ULLONG_MAX (~0ULL) #define SIZE_MAX (~(size_t)0) +#define PHYS_ADDR_MAX (~(phys_addr_t)0) #define U8_MAX ((u8)~0U) #define S8_MAX ((s8)(U8_MAX>>1)) -- cgit v1.2.3 From bbec2e15170aae3e084d7d9afc730aeebe01b654 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:18 -0700 Subject: mm: rename page_counter's count/limit into usage/max This patch renames struct page_counter fields: count -> usage limit -> max and the corresponding functions: page_counter_limit() -> page_counter_set_max() mem_cgroup_get_limit() -> mem_cgroup_get_max() mem_cgroup_resize_limit() -> mem_cgroup_resize_max() memcg_update_kmem_limit() -> memcg_update_kmem_max() memcg_update_tcp_limit() -> memcg_update_tcp_max() The idea behind this renaming is to have the direct matching between memory cgroup knobs (low, high, max) and page_counters API. This is pure renaming, this patch doesn't bring any functional change. Link: http://lkml.kernel.org/r/20180405185921.4942-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 ++-- include/linux/page_counter.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 517096c3cc99..577a19a6a93b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -467,7 +467,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, void mem_cgroup_handle_over_high(void); -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg); +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -858,7 +858,7 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, return 0; } -static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index c15ab80ad32d..94029dad9317 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -7,8 +7,8 @@ #include struct page_counter { - atomic_long_t count; - unsigned long limit; + atomic_long_t usage; + unsigned long max; struct page_counter *parent; /* legacy */ @@ -25,14 +25,14 @@ struct page_counter { static inline void page_counter_init(struct page_counter *counter, struct page_counter *parent) { - atomic_long_set(&counter->count, 0); - counter->limit = PAGE_COUNTER_MAX; + atomic_long_set(&counter->usage, 0); + counter->max = PAGE_COUNTER_MAX; counter->parent = parent; } static inline unsigned long page_counter_read(struct page_counter *counter) { - return atomic_long_read(&counter->count); + return atomic_long_read(&counter->usage); } void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); @@ -41,7 +41,7 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_limit(struct page_counter *counter, unsigned long limit); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); -- cgit v1.2.3 From 230671533d64631116be3ff9d407bd9ca5a58e1b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:22 -0700 Subject: mm: memory.low hierarchical behavior This patch aims to address an issue in current memory.low semantics, which makes it hard to use it in a hierarchy, where some leaf memory cgroups are more valuable than others. For example, there are memcgs A, A/B, A/C, A/D and A/E: A A/memory.low = 2G, A/memory.current = 6G //\\ BC DE B/memory.low = 3G B/memory.current = 2G C/memory.low = 1G C/memory.current = 2G D/memory.low = 0 D/memory.current = 2G E/memory.low = 10G E/memory.current = 0 If we apply memory pressure, B, C and D are reclaimed at the same pace while A's usage exceeds 2G. This is obviously wrong, as B's usage is fully below B's memory.low, and C has 1G of protection as well. Also, A is pushed to the size, which is less than A's 2G memory.low, which is also wrong. A simple bash script (provided below) can be used to reproduce the problem. Current results are: A: 1430097920 A/B: 711929856 A/C: 717426688 A/D: 741376 A/E: 0 To address the issue a concept of effective memory.low is introduced. Effective memory.low is always equal or less than original memory.low. In a case, when there is no memory.low overcommittment (and also for top-level cgroups), these two values are equal. Otherwise it's a part of parent's effective memory.low, calculated as a cgroup's memory.low usage divided by sum of sibling's memory.low usages (under memory.low usage I mean the size of actually protected memory: memory.current if memory.current < memory.low, 0 otherwise). It's necessary to track the actual usage, because otherwise an empty cgroup with memory.low set (A/E in my example) will affect actual memory distribution, which makes no sense. To avoid traversing the cgroup tree twice, page_counters code is reused. Calculating effective memory.low can be done in the reclaim path, as we conveniently traversing the cgroup tree from top to bottom and check memory.low on each level. So, it's a perfect place to calculate effective memory low and save it to use it for children cgroups. This also eliminates a need to traverse the cgroup tree from bottom to top each time to check if parent's guarantee is not exceeded. Setting/resetting effective memory.low is intentionally racy, but it's fine and shouldn't lead to any significant differences in actual memory distribution. With this patch applied results are matching the expectations: A: 2147930112 A/B: 1428721664 A/C: 718393344 A/D: 815104 A/E: 0 Test script: #!/bin/bash CGPATH="/sys/fs/cgroup" truncate /file1 --size 2G truncate /file2 --size 2G truncate /file3 --size 2G truncate /file4 --size 50G mkdir "${CGPATH}/A" echo "+memory" > "${CGPATH}/A/cgroup.subtree_control" mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E" echo 2G > "${CGPATH}/A/memory.low" echo 3G > "${CGPATH}/A/B/memory.low" echo 1G > "${CGPATH}/A/C/memory.low" echo 0 > "${CGPATH}/A/D/memory.low" echo 10G > "${CGPATH}/A/E/memory.low" echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1 echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2 echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3 echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4 echo "A: " `cat "${CGPATH}/A/memory.current"` echo "A/B: " `cat "${CGPATH}/A/B/memory.current"` echo "A/C: " `cat "${CGPATH}/A/C/memory.current"` echo "A/D: " `cat "${CGPATH}/A/D/memory.current"` echo "A/E: " `cat "${CGPATH}/A/E/memory.current"` rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E" rmdir "${CGPATH}/A" rm /file1 /file2 /file3 /file4 Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +-- include/linux/page_counter.h | 7 +++++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 577a19a6a93b..891945507044 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,8 +181,7 @@ struct mem_cgroup { struct page_counter kmem; struct page_counter tcpmem; - /* Normal memory consumption range */ - unsigned long low; + /* Upper bound of normal memory consumption range */ unsigned long high; /* Range enforcement for interrupt charges */ diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 94029dad9317..7902a727d3b6 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -9,8 +9,14 @@ struct page_counter { atomic_long_t usage; unsigned long max; + unsigned long low; struct page_counter *parent; + /* effective memory.low and memory.low usage tracking */ + unsigned long elow; + atomic_long_t low_usage; + atomic_long_t children_low_usage; + /* legacy */ unsigned long watermark; unsigned long failcnt; @@ -42,6 +48,7 @@ bool page_counter_try_charge(struct page_counter *counter, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); -- cgit v1.2.3 From 93781325da6e07af57a21f111d1f2b9f128fa397 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 7 Jun 2018 17:07:02 -0700 Subject: lockdep: fix fs_reclaim annotation While revisiting my Btrfs swapfile series [1], I introduced a situation in which reclaim would lock i_rwsem, and even though the swapon() path clearly made GFP_KERNEL allocations while holding i_rwsem, I got no complaints from lockdep. It turns out that the rework of the fs_reclaim annotation was broken: if the current task has PF_MEMALLOC set, we don't acquire the dummy fs_reclaim lock, but when reclaiming we always check this _after_ we've just set the PF_MEMALLOC flag. In most cases, we can fix this by moving the fs_reclaim_{acquire,release}() outside of the memalloc_noreclaim_{save,restore}(), althought kswapd is slightly different. After applying this, I got the expected lockdep splats. 1: https://lwn.net/Articles/625412/ Link: http://lkml.kernel.org/r/9f8aa70652a98e98d7c4de0fc96a4addcee13efe.1523778026.git.osandov@fb.com Fixes: d92a8cfcb37e ("locking/lockdep: Rework FS_RECLAIM annotation") Signed-off-by: Omar Sandoval Reviewed-by: Andrew Morton Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 76a8cb4ef178..44d356f5e47c 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -163,9 +163,13 @@ static inline gfp_t current_gfp_context(gfp_t flags) } #ifdef CONFIG_LOCKDEP +extern void __fs_reclaim_acquire(void); +extern void __fs_reclaim_release(void); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else +static inline void __fs_reclaim_acquire(void) { } +static inline void __fs_reclaim_release(void) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif -- cgit v1.2.3 From 48f49e1f66854ef44439b84a5cf47179539f3804 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 7 Jun 2018 17:07:07 -0700 Subject: mm/ksm: remove unused page_referenced_ksm declaration Commit 9f32624be943 ("mm/rmap: use rmap_walk() in page_referenced()") removed the declaration of page_referenced_ksm for the case CONFIG_KSM=y, but left one for CONFIG_KSM=n. Remove the unused leftover. Link: http://lkml.kernel.org/r/1524552106-7356-2-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 44368b19b27e..bbdfca3db6e1 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -89,12 +89,6 @@ static inline struct page *ksm_might_need_to_copy(struct page *page, return page; } -static inline int page_referenced_ksm(struct page *page, - struct mem_cgroup *memcg, unsigned long *vm_flags) -{ - return 0; -} - static inline void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { -- cgit v1.2.3 From 88484826bc67844eeae1855ebe32cce9edd937c9 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 7 Jun 2018 17:07:11 -0700 Subject: mm/ksm: move [set_]page_stable_node from ksm.h to ksm.c page_stable_node() and set_page_stable_node() are only used in mm/ksm.c and there is no point to keep them in the include/linux/ksm.h [akpm@linux-foundation.org: fix SYSFS=n build] Link: http://lkml.kernel.org/r/1524552106-7356-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index bbdfca3db6e1..161e8164abcf 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -37,17 +37,6 @@ static inline void ksm_exit(struct mm_struct *mm) __ksm_exit(mm); } -static inline struct stable_node *page_stable_node(struct page *page) -{ - return PageKsm(page) ? page_rmapping(page) : NULL; -} - -static inline void set_page_stable_node(struct page *page, - struct stable_node *stable_node) -{ - page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); -} - /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, -- cgit v1.2.3 From 9ccc361716362e8abb01d9944f8df97b4aac3e23 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 7 Jun 2018 17:07:19 -0700 Subject: memcg: writeback: use memcg->cgwb_list directly mem_cgroup_cgwb_list is a very simple wrapper and it will never be used outside of code under CONFIG_CGROUP_WRITEBACK. so use memcg->cgwb_list directly. Link: http://lkml.kernel.org/r/1524406173-212182-1-git-send-email-wanglong19@meituan.com Signed-off-by: Wang Long Reviewed-by: Jan Kara Acked-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 891945507044..10d741e8fe51 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1097,7 +1097,6 @@ static inline void dec_lruvec_page_state(struct page *page, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, -- cgit v1.2.3 From fb52bbaee598f58352d8732637ebe7013b2df79f Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 7 Jun 2018 17:07:43 -0700 Subject: mm: move is_pageblock_removable_nolock() to mm/memory_hotplug.c is_pageblock_removable_nolock() is not used outside of mm/memory_hotplug.c. Move it next to unique caller is_mem_section_removable() and make it static. Remove prototype in to silence gcc warning (W=1): mm/page_alloc.c:7704:6: warning: no previous prototype for `is_pageblock_removable_nolock' [-Wmissing-prototypes] Link: http://lkml.kernel.org/r/20180509190001.24789-1-malat@debian.org Signed-off-by: Mathieu Malaterre Suggested-by: Michal Hocko Reviewed-by: Andrew Morton Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2b0265265c28..4e9828cda7a2 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -107,7 +107,6 @@ static inline bool movable_node_is_enabled(void) } #ifdef CONFIG_MEMORY_HOTREMOVE -extern bool is_pageblock_removable_nolock(struct page *page); extern int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap); extern int __remove_pages(struct zone *zone, unsigned long start_pfn, -- cgit v1.2.3 From bf8d5d52ffe89aac5b46ddb39dd1a4351fae5df4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:07:46 -0700 Subject: memcg: introduce memory.min Memory controller implements the memory.low best-effort memory protection mechanism, which works perfectly in many cases and allows protecting working sets of important workloads from sudden reclaim. But its semantics has a significant limitation: it works only as long as there is a supply of reclaimable memory. This makes it pretty useless against any sort of slow memory leaks or memory usage increases. This is especially true for swapless systems. If swap is enabled, memory soft protection effectively postpones problems, allowing a leaking application to fill all swap area, which makes no sense. The only effective way to guarantee the memory protection in this case is to invoke the OOM killer. It's possible to handle this case in userspace by reacting on MEMCG_LOW events; but there is still a place for a fail-safe in-kernel mechanism to provide stronger guarantees. This patch introduces the memory.min interface for cgroup v2 memory controller. It works very similarly to memory.low (sharing the same hierarchical behavior), except that it's not disabled if there is no more reclaimable memory in the system. If cgroup is not populated, its memory.min is ignored, because otherwise even the OOM killer wouldn't be able to reclaim the protected memory, and the system can stall. [guro@fb.com: s/low/min/ in docs] Link: http://lkml.kernel.org/r/20180510130758.GA9129@castle.DHCP.thefacebook.com Link: http://lkml.kernel.org/r/20180509180734.GA4856@castle.DHCP.thefacebook.com Signed-off-by: Roman Gushchin Reviewed-by: Randy Dunlap Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 +++++++++++---- include/linux/page_counter.h | 11 +++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 10d741e8fe51..9c04cf8e6487 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,6 +58,12 @@ enum memcg_memory_event { MEMCG_NR_MEMORY_EVENTS, }; +enum mem_cgroup_protection { + MEMCG_PROT_NONE, + MEMCG_PROT_LOW, + MEMCG_PROT_MIN, +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int priority; @@ -289,7 +295,8 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp, @@ -734,10 +741,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, { } -static inline bool mem_cgroup_low(struct mem_cgroup *root, - struct mem_cgroup *memcg) +static inline enum mem_cgroup_protection mem_cgroup_protected( + struct mem_cgroup *root, struct mem_cgroup *memcg) { - return false; + return MEMCG_PROT_NONE; } static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 7902a727d3b6..bab7e57f659b 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -8,10 +8,16 @@ struct page_counter { atomic_long_t usage; - unsigned long max; + unsigned long min; unsigned long low; + unsigned long max; struct page_counter *parent; + /* effective memory.min and memory.min usage tracking */ + unsigned long emin; + atomic_long_t min_usage; + atomic_long_t children_min_usage; + /* effective memory.low and memory.low usage tracking */ unsigned long elow; atomic_long_t low_usage; @@ -47,8 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); -- cgit v1.2.3 From 2bcd6454bae78775632e1848ce1eabf65c6fbd4f Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:08:00 -0700 Subject: mm: use new return type vm_fault_t Use new return type vm_fault_t for fault handler in struct vm_operations_struct. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Link: http://lkml.kernel.org/r/20180511190542.GA2412@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Cc: Dan Williams Cc: Jan Kara Cc: Ross Zwisler Cc: Rik van Riel Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Pavel Tatashin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b7012bb1d218..d1dc618a56bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2303,10 +2303,10 @@ extern void truncate_inode_pages_range(struct address_space *, extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ -extern int filemap_fault(struct vm_fault *vmf); +extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); -extern int filemap_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); /* mm/page-writeback.c */ int __must_check write_one_page(struct page *page); -- cgit v1.2.3 From b3ec9f33acb8d3a6173515d3d547be969dc70566 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:08:04 -0700 Subject: mm: change return type to vm_fault_t Use new return type vm_fault_t for fault handler in struct vm_operations_struct. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. See commit 1c8f422059ae ("mm: change return type to vm_fault_t") Link: http://lkml.kernel.org/r/20180512063745.GA26866@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Joe Perches Cc: Michal Hocko Cc: Hugh Dickins Cc: Dan Williams Cc: David Rientjes Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 49dd59ea90f7..eb9da245286d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -629,9 +629,9 @@ struct vm_special_mapping { * If non-NULL, then this is called to resolve page faults * on the special mapping. If used, .pages is not checked. */ - int (*fault)(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, - struct vm_fault *vmf); + vm_fault_t (*fault)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, + struct vm_fault *vmf); int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); -- cgit v1.2.3 From e67d4ca79aaf9d13a00d229b1b1c96b86828e8ba Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 7 Jun 2018 17:08:11 -0700 Subject: mm: save two stranded bits in gfp_mask ___GFP_COLD and ___GFP_OTHER_NODE were removed but their bits were stranded. Fill the gaps by moving the existing gfp masks around. Link: http://lkml.kernel.org/r/20180516211439.177440-1-shakeelb@google.com Signed-off-by: Shakeel Butt Suggested-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Greg Thelen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fc5ab85278d5..7f860ea29ec6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -24,6 +24,7 @@ struct vm_area_struct; #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u +#define ___GFP_WRITE 0x100u #define ___GFP_NOWARN 0x200u #define ___GFP_RETRY_MAYFAIL 0x400u #define ___GFP_NOFAIL 0x800u @@ -36,11 +37,10 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x40000u #define ___GFP_ATOMIC 0x80000u #define ___GFP_ACCOUNT 0x100000u -#define ___GFP_DIRECT_RECLAIM 0x400000u -#define ___GFP_WRITE 0x800000u -#define ___GFP_KSWAPD_RECLAIM 0x1000000u +#define ___GFP_DIRECT_RECLAIM 0x200000u +#define ___GFP_KSWAPD_RECLAIM 0x400000u #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x2000000u +#define ___GFP_NOLOCKDEP 0x800000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -205,7 +205,7 @@ struct vm_area_struct; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* -- cgit v1.2.3 From 6e292b9be7f4358985ce33ae1f59ab30a8c09e08 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:18 -0700 Subject: mm: split page_type out from _mapcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We're already using a union of many fields here, so stop abusing the _mapcount and make page_type its own field. That implies renaming some of the machinery that creates PageBuddy, PageBalloon and PageKmemcg; bring back the PG_buddy, PG_balloon and PG_kmemcg names. As suggested by Kirill, make page_type a bitmask. Because it starts out life as -1 (thanks to sharing the storage with _mapcount), setting a page flag means clearing the appropriate bit. This gives us space for probably twenty or so extra bits (depending how paranoid we want to be about _mapcount underflow). Link: http://lkml.kernel.org/r/20180518194519.3820-3-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 13 ++++++++----- include/linux/page-flags.h | 45 ++++++++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index eb9da245286d..3a554fdf45c2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,6 +96,14 @@ struct page { }; union { + /* + * If the page is neither PageSlab nor mappable to userspace, + * the value stored here may help determine what this page + * is used for. See page-flags.h for a list of page types + * which are currently stored here. + */ + unsigned int page_type; + _slub_counter_t counters; unsigned int active; /* SLAB */ struct { /* SLUB */ @@ -109,11 +117,6 @@ struct page { /* * Count of ptes mapped in mms, to show when * page is mapped & limit reverse map searches. - * - * Extra information about page type may be - * stored here for pages that are never mapped, - * in which case the value MUST BE <= -2. - * See page-flags.h for more details. */ atomic_t _mapcount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e34a27727b9a..8c25b28a35aa 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -642,49 +642,56 @@ PAGEFLAG_FALSE(DoubleMap) #endif /* - * For pages that are never mapped to userspace, page->mapcount may be - * used for storing extra information about page type. Any value used - * for this purpose must be <= -2, but it's better start not too close - * to -2 so that an underflow of the page_mapcount() won't be mistaken - * for a special page. + * For pages that are never mapped to userspace (and aren't PageSlab), + * page_type may be used. Because it is initialised to -1, we invert the + * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and + * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and + * low bits so that an underflow or overflow of page_mapcount() won't be + * mistaken for a page type value. */ -#define PAGE_MAPCOUNT_OPS(uname, lname) \ + +#define PAGE_TYPE_BASE 0xf0000000 +/* Reserve 0x0000007f to catch underflows of page_mapcount */ +#define PG_buddy 0x00000080 +#define PG_balloon 0x00000100 +#define PG_kmemcg 0x00000200 + +#define PageType(page, flag) \ + ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + +#define PAGE_TYPE_OPS(uname, lname) \ static __always_inline int Page##uname(struct page *page) \ { \ - return atomic_read(&page->_mapcount) == \ - PAGE_##lname##_MAPCOUNT_VALUE; \ + return PageType(page, PG_##lname); \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); \ - atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE); \ + VM_BUG_ON_PAGE(!PageType(page, 0), page); \ + page->page_type &= ~PG_##lname; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ - atomic_set(&page->_mapcount, -1); \ + page->page_type |= PG_##lname; \ } /* - * PageBuddy() indicate that the page is free and in the buddy system + * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ -#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) -PAGE_MAPCOUNT_OPS(Buddy, BUDDY) +PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is set on pages that are on the balloon page list + * PageBalloon() is true for pages that are on the balloon page list * (see mm/balloon_compaction.c). */ -#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) -PAGE_MAPCOUNT_OPS(Balloon, BALLOON) +PAGE_TYPE_OPS(Balloon, balloon) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. */ -#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512) -PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG) +PAGE_TYPE_OPS(Kmemcg, kmemcg) extern bool is_free_buddy_page(struct page *page); -- cgit v1.2.3 From 1d40a5ea01d53251c23c7be541d3f4a656cfc537 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:23 -0700 Subject: mm: mark pages in use for page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define a new PageTable bit in the page_type and use it to mark pages in use as page tables. This can be helpful when debugging crashdumps or analysing memory fragmentation. Add a KPF flag to report these pages to userspace and update page-types.c to interpret that flag. Note that only pages currently accounted as NR_PAGETABLES are tracked as PageTable; this does not include pgd/p4d/pud/pmd pages. Those will be the subject of a later patch. Link: http://lkml.kernel.org/r/20180518194519.3820-4-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ include/linux/page-flags.h | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d1dc618a56bf..1dd588b7c649 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1851,6 +1851,7 @@ static inline bool pgtable_page_ctor(struct page *page) { if (!ptlock_init(page)) return false; + __SetPageTable(page); inc_zone_page_state(page, NR_PAGETABLE); return true; } @@ -1858,6 +1859,7 @@ static inline bool pgtable_page_ctor(struct page *page) static inline void pgtable_page_dtor(struct page *page) { pte_lock_deinit(page); + __ClearPageTable(page); dec_zone_page_state(page, NR_PAGETABLE); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8c25b28a35aa..901943e4754b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -655,6 +655,7 @@ PAGEFLAG_FALSE(DoubleMap) #define PG_buddy 0x00000080 #define PG_balloon 0x00000100 #define PG_kmemcg 0x00000200 +#define PG_table 0x00000400 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -693,6 +694,11 @@ PAGE_TYPE_OPS(Balloon, balloon) */ PAGE_TYPE_OPS(Kmemcg, kmemcg) +/* + * Marks pages in use as page tables. + */ +PAGE_TYPE_OPS(Table, table) + extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); -- cgit v1.2.3 From d4fc5069a39405d67937b25c0dc9249a9b6c50f1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:26 -0700 Subject: mm: switch s_mem and slab_cache in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will allow us to store slub's counters in the same bits as slab's s_mem. slub now needs to set page->mapping to NULL as it frees the page, just like slab does. Link: http://lkml.kernel.org/r/20180518194519.3820-5-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3a554fdf45c2..6f153f2fab50 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -83,7 +83,7 @@ struct page { /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ struct address_space *mapping; - void *s_mem; /* slab first object */ + struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ atomic_t compound_mapcount; /* first tail page */ /* page_deferred_list().next -- second tail page */ }; @@ -194,7 +194,7 @@ struct page { spinlock_t ptl; #endif #endif - struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ + void *s_mem; /* slab first object */ }; #ifdef CONFIG_MEMCG -- cgit v1.2.3 From 7d27a04bb2b5bd665f147439d18ae236080eef32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:31 -0700 Subject: mm: move 'private' union within struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By moving page->private to the fourth word of struct page, we can put the SLUB counters in the same word as SLAB's s_mem and still do the cmpxchg_double trick. Now the SLUB counters no longer overlap with the mapcount or refcount so we can drop the call to page_mapcount_reset() and simplify set_page_slub_counters() to a single line. Link: http://lkml.kernel.org/r/20180518194519.3820-6-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 56 +++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6f153f2fab50..bcc5ee8b7b07 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -65,15 +65,9 @@ struct hmm; */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) -#define _slub_counter_t unsigned long #else -#define _slub_counter_t unsigned int -#endif -#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ #define _struct_page_alignment -#define _slub_counter_t unsigned int -#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ +#endif struct page { /* First double word block */ @@ -95,6 +89,30 @@ struct page { /* page_deferred_list().prev -- second tail page */ }; + union { + /* + * Mapping-private opaque data: + * Usually used for buffer_heads if PagePrivate + * Used for swp_entry_t if PageSwapCache + * Indicates order in the buddy system if PageBuddy + */ + unsigned long private; +#if USE_SPLIT_PTE_PTLOCKS +#if ALLOC_SPLIT_PTLOCKS + spinlock_t *ptl; +#else + spinlock_t ptl; +#endif +#endif + void *s_mem; /* slab first object */ + unsigned long counters; /* SLUB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + union { /* * If the page is neither PageSlab nor mappable to userspace, @@ -104,13 +122,7 @@ struct page { */ unsigned int page_type; - _slub_counter_t counters; unsigned int active; /* SLAB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; int units; /* SLOB */ struct { /* Page cache */ @@ -179,24 +191,6 @@ struct page { #endif }; - union { - /* - * Mapping-private opaque data: - * Usually used for buffer_heads if PagePrivate - * Used for swp_entry_t if PageSwapCache - * Indicates order in the buddy system if PageBuddy - */ - unsigned long private; -#if USE_SPLIT_PTE_PTLOCKS -#if ALLOC_SPLIT_PTLOCKS - spinlock_t *ptl; -#else - spinlock_t ptl; -#endif -#endif - void *s_mem; /* slab first object */ - }; - #ifdef CONFIG_MEMCG struct mem_cgroup *mem_cgroup; #endif -- cgit v1.2.3 From b21999da02f891aafa362252f4db7ba5c34a9a8d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:35 -0700 Subject: mm: move _refcount out of struct page union MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keeping the refcount in the union only encourages people to put something else in the union which will overlap with _refcount and eventually explode messily. pahole reports no fields change location. Link: http://lkml.kernel.org/r/20180518194519.3820-7-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bcc5ee8b7b07..0b0c0e224011 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -113,7 +113,13 @@ struct page { }; }; - union { + union { /* This union is 4 bytes in size. */ + /* + * If the page can be mapped to userspace, encodes the number + * of times this page is referenced by a page table. + */ + atomic_t _mapcount; + /* * If the page is neither PageSlab nor mappable to userspace, * the value stored here may help determine what this page @@ -124,22 +130,11 @@ struct page { unsigned int active; /* SLAB */ int units; /* SLOB */ - - struct { /* Page cache */ - /* - * Count of ptes mapped in mms, to show when - * page is mapped & limit reverse map searches. - */ - atomic_t _mapcount; - - /* - * Usage count, *USE WRAPPER FUNCTION* when manual - * accounting. See page_ref.h - */ - atomic_t _refcount; - }; }; + /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ + atomic_t _refcount; + /* * WARNING: bit 0 of the first word encode PageTail(). That means * the rest users of the storage space MUST NOT use the bit to -- cgit v1.2.3 From 66a6ffd2af42b0bafaecccbd7c1c30d386de21ab Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:39 -0700 Subject: mm: combine first three unions in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By combining these three one-word unions into one three-word union, we make it easier for users to add their own multi-word fields to struct page, as well as making it obvious that SLUB needs to keep its double-word alignment for its freelist & counters. No field moves position; verified with pahole. Link: http://lkml.kernel.org/r/20180518194519.3820-8-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 66 ++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0b0c0e224011..158f9693f865 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -70,46 +70,46 @@ struct hmm; #endif struct page { - /* First double word block */ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ + /* Three words (12/24 bytes) are available in this union. */ union { - /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ - struct address_space *mapping; - - struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ + struct { /* Page cache and anonymous pages */ + /* See page-flags.h for PAGE_MAPPING_FLAGS */ + struct address_space *mapping; + pgoff_t index; /* Our offset within mapping. */ + /** + * @private: Mapping-private opaque data. + * Usually used for buffer_heads if PagePrivate. + * Used for swp_entry_t if PageSwapCache. + * Indicates order in the buddy system if PageBuddy. + */ + unsigned long private; + }; + struct { /* slab, slob and slub */ + struct kmem_cache *slab_cache; /* not slob */ + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + void *s_mem; /* slab: first object */ + unsigned long counters; /* SLUB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + }; atomic_t compound_mapcount; /* first tail page */ - /* page_deferred_list().next -- second tail page */ - }; - - /* Second double word */ - union { - pgoff_t index; /* Our offset within mapping. */ - void *freelist; /* sl[aou]b first free object */ - /* page_deferred_list().prev -- second tail page */ - }; - - union { - /* - * Mapping-private opaque data: - * Usually used for buffer_heads if PagePrivate - * Used for swp_entry_t if PageSwapCache - * Indicates order in the buddy system if PageBuddy - */ - unsigned long private; -#if USE_SPLIT_PTE_PTLOCKS + struct list_head deferred_list; /* second tail page */ + struct { /* Page table pages */ + unsigned long _pt_pad_2; /* mapping */ + unsigned long _pt_pad_3; #if ALLOC_SPLIT_PTLOCKS - spinlock_t *ptl; + spinlock_t *ptl; #else - spinlock_t ptl; -#endif + spinlock_t ptl; #endif - void *s_mem; /* slab first object */ - unsigned long counters; /* SLUB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; }; }; -- cgit v1.2.3 From b7ccc7f8c643de0b71cd2da3245d0c27038a8dd7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:46 -0700 Subject: mm: move lru union within struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the LRU is two words, this does not affect the double-word alignment of SLUB's freelist. Link: http://lkml.kernel.org/r/20180518194519.3820-10-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 102 +++++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 158f9693f865..5b97bd445ada 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -72,6 +72,57 @@ struct hmm; struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ + /* + * WARNING: bit 0 of the first word encode PageTail(). That means + * the rest users of the storage space MUST NOT use the bit to + * avoid collision and false-positive PageTail(). + */ + union { + struct list_head lru; /* Pageout list, eg. active_list + * protected by zone_lru_lock ! + * Can be used as a generic list + * by the page owner. + */ + struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an + * lru or handled by a slab + * allocator, this points to the + * hosting device page map. + */ + struct { /* slub per cpu partial pages */ + struct page *next; /* Next partial slab */ +#ifdef CONFIG_64BIT + int pages; /* Nr of partial slabs left */ + int pobjects; /* Approximate # of objects */ +#else + short int pages; + short int pobjects; +#endif + }; + + struct rcu_head rcu_head; /* Used by SLAB + * when destroying via RCU + */ + /* Tail pages of compound page */ + struct { + unsigned long compound_head; /* If bit zero is set */ + + /* First tail page only */ + unsigned char compound_dtor; + unsigned char compound_order; + /* two/six bytes available here */ + }; + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS + struct { + unsigned long __pad; /* do not overlay pmd_huge_pte + * with compound_head to avoid + * possible bit 0 collision. + */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ + }; +#endif + }; + /* Three words (12/24 bytes) are available in this union. */ union { struct { /* Page cache and anonymous pages */ @@ -135,57 +186,6 @@ struct page { /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ atomic_t _refcount; - /* - * WARNING: bit 0 of the first word encode PageTail(). That means - * the rest users of the storage space MUST NOT use the bit to - * avoid collision and false-positive PageTail(). - */ - union { - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone_lru_lock ! - * Can be used as a generic list - * by the page owner. - */ - struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an - * lru or handled by a slab - * allocator, this points to the - * hosting device page map. - */ - struct { /* slub per cpu partial pages */ - struct page *next; /* Next partial slab */ -#ifdef CONFIG_64BIT - int pages; /* Nr of partial slabs left */ - int pobjects; /* Approximate # of objects */ -#else - short int pages; - short int pobjects; -#endif - }; - - struct rcu_head rcu_head; /* Used by SLAB - * when destroying via RCU - */ - /* Tail pages of compound page */ - struct { - unsigned long compound_head; /* If bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - /* two/six bytes available here */ - }; - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - struct { - unsigned long __pad; /* do not overlay pmd_huge_pte - * with compound_head to avoid - * possible bit 0 collision. - */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ - }; -#endif - }; - #ifdef CONFIG_MEMCG struct mem_cgroup *mem_cgroup; #endif -- cgit v1.2.3 From 4da1984edbbed0ffc961006d265d78a7b0095ea4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:50 -0700 Subject: mm: combine LRU and main union in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives us five words of space in a single union in struct page. The compound_mapcount moves position (from offset 24 to offset 20) on 64-bit systems, but that does not seem likely to cause any trouble. Link: http://lkml.kernel.org/r/20180518194519.3820-11-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 97 +++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5b97bd445ada..dfe6d648e055 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -73,59 +73,19 @@ struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ /* - * WARNING: bit 0 of the first word encode PageTail(). That means - * the rest users of the storage space MUST NOT use the bit to + * Five words (20/40 bytes) are available in this union. + * WARNING: bit 0 of the first word is used for PageTail(). That + * means the other users of this union MUST NOT use the bit to * avoid collision and false-positive PageTail(). */ - union { - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone_lru_lock ! - * Can be used as a generic list - * by the page owner. - */ - struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an - * lru or handled by a slab - * allocator, this points to the - * hosting device page map. - */ - struct { /* slub per cpu partial pages */ - struct page *next; /* Next partial slab */ -#ifdef CONFIG_64BIT - int pages; /* Nr of partial slabs left */ - int pobjects; /* Approximate # of objects */ -#else - short int pages; - short int pobjects; -#endif - }; - - struct rcu_head rcu_head; /* Used by SLAB - * when destroying via RCU - */ - /* Tail pages of compound page */ - struct { - unsigned long compound_head; /* If bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - /* two/six bytes available here */ - }; - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - struct { - unsigned long __pad; /* do not overlay pmd_huge_pte - * with compound_head to avoid - * possible bit 0 collision. - */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ - }; -#endif - }; - - /* Three words (12/24 bytes) are available in this union. */ union { struct { /* Page cache and anonymous pages */ + /** + * @lru: Pageout list, eg. active_list protected by + * zone_lru_lock. Sometimes used as a generic list + * by the page owner. + */ + struct list_head lru; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; pgoff_t index; /* Our offset within mapping. */ @@ -138,6 +98,19 @@ struct page { unsigned long private; }; struct { /* slab, slob and slub */ + union { + struct list_head slab_list; /* uses lru */ + struct { /* Partial pages */ + struct page *next; +#ifdef CONFIG_64BIT + int pages; /* Nr of pages left */ + int pobjects; /* Approximate count */ +#else + short int pages; + short int pobjects; +#endif + }; + }; struct kmem_cache *slab_cache; /* not slob */ /* Double-word boundary */ void *freelist; /* first free object */ @@ -151,9 +124,22 @@ struct page { }; }; }; - atomic_t compound_mapcount; /* first tail page */ - struct list_head deferred_list; /* second tail page */ + struct { /* Tail pages of compound page */ + unsigned long compound_head; /* Bit zero is set */ + + /* First tail page only */ + unsigned char compound_dtor; + unsigned char compound_order; + atomic_t compound_mapcount; + }; + struct { /* Second tail page of compound page */ + unsigned long _compound_pad_1; /* compound_head */ + unsigned long _compound_pad_2; + struct list_head deferred_list; + }; struct { /* Page table pages */ + unsigned long _pt_pad_1; /* compound_head */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ unsigned long _pt_pad_2; /* mapping */ unsigned long _pt_pad_3; #if ALLOC_SPLIT_PTLOCKS @@ -162,6 +148,15 @@ struct page { spinlock_t ptl; #endif }; + + /** @rcu_head: You can use this to free a page by RCU. */ + struct rcu_head rcu_head; + + /** + * @pgmap: For ZONE_DEVICE pages, this points to the hosting + * device page map. + */ + struct dev_pagemap *pgmap; }; union { /* This union is 4 bytes in size. */ -- cgit v1.2.3 From 97b4a671987123009724dfbdf188ec6e122573d4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:53 -0700 Subject: mm: improve struct page documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite the documentation to describe what you can use in struct page rather than what you can't. Link: http://lkml.kernel.org/r/20180518194519.3820-12-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Randy Dunlap Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index dfe6d648e055..9b45760cc797 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -33,29 +33,27 @@ struct hmm; * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us - * who is mapping it. If you allocate the page using alloc_pages(), you - * can use some of the space in struct page for your own purposes. + * who is mapping it. * - * Pages that were once in the page cache may be found under the RCU lock - * even after they have been recycled to a different purpose. The page - * cache reads and writes some of the fields in struct page to pin the - * page before checking that it's still in the page cache. It is vital - * that all users of struct page: - * 1. Use the first word as PageFlags. - * 2. Clear or preserve bit 0 of page->compound_head. It is used as - * PageTail for compound pages, and the page cache must not see false - * positives. Some users put a pointer here (guaranteed to be at least - * 4-byte aligned), other users avoid using the field altogether. - * 3. page->_refcount must either not be used, or must be used in such a - * way that other CPUs temporarily incrementing and then decrementing the - * refcount does not cause problems. On receiving the page from - * alloc_pages(), the refcount will be positive. - * 4. Either preserve page->_mapcount or restore it to -1 before freeing it. + * If you allocate the page using alloc_pages(), you can use some of the + * space in struct page for your own purposes. The five words in the main + * union are available, except for bit 0 of the first word which must be + * kept clear. Many users use this word to store a pointer to an object + * which is guaranteed to be aligned. If you use the same storage as + * page->mapping, you must restore it to NULL before freeing the page. * - * If you allocate pages of order > 0, you can use the fields in the struct - * page associated with each page, but bear in mind that the pages may have - * been inserted individually into the page cache, so you must use the above - * four fields in a compatible way for each struct page. + * If your page will not be mapped to userspace, you can also use the four + * bytes in the mapcount union, but you must call page_mapcount_reset() + * before freeing it. + * + * If you want to use the refcount field, it must be used in such a way + * that other CPUs temporarily incrementing and then decrementing the + * refcount does not cause problems. On receiving the page from + * alloc_pages(), the refcount will be positive. + * + * If you allocate pages of order > 0, you can use some of the fields + * in each subpage, but you may need to restore some of their values + * afterwards. * * SLUB uses cmpxchg_double() to atomically update its freelist and * counters. That requires that freelist & counters be adjacent and -- cgit v1.2.3 From a052f0a5168ef99c517f6638aaec59f39060f81d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:57 -0700 Subject: mm: add pt_mm to struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For pgd page table pages, x86 overloads the page->index field to store a pointer to the mm_struct. Rename this to pt_mm so it's visible to other users. Link: http://lkml.kernel.org/r/20180518194519.3820-13-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9b45760cc797..811aa0c71e1a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -139,7 +139,7 @@ struct page { unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ unsigned long _pt_pad_2; /* mapping */ - unsigned long _pt_pad_3; + struct mm_struct *pt_mm; /* x86 pgds only */ #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else -- cgit v1.2.3 From 50e7fbc3bf0c3e41af0075f17a9e4ddefa82e1b5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:09:01 -0700 Subject: mm: add hmm_data to struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make hmm_data an explicit member of the struct page union. Link: http://lkml.kernel.org/r/20180518194519.3820-14-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 8 ++------ include/linux/mm_types.h | 12 ++++++------ 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 2f1327c37a63..4c92e3ba3e16 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -522,9 +522,7 @@ void hmm_devmem_remove(struct hmm_devmem *devmem); static inline void hmm_devmem_page_set_drvdata(struct page *page, unsigned long data) { - unsigned long *drvdata = (unsigned long *)&page->pgmap; - - drvdata[1] = data; + page->hmm_data = data; } /* @@ -535,9 +533,7 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page, */ static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { - const unsigned long *drvdata = (const unsigned long *)&page->pgmap; - - return drvdata[1]; + return page->hmm_data; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 811aa0c71e1a..99ce070e7dcb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -146,15 +146,15 @@ struct page { spinlock_t ptl; #endif }; + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + unsigned long hmm_data; + unsigned long _zd_pad_1; /* uses mapping */ + }; /** @rcu_head: You can use this to free a page by RCU. */ struct rcu_head rcu_head; - - /** - * @pgmap: For ZONE_DEVICE pages, this points to the hosting - * device page map. - */ - struct dev_pagemap *pgmap; }; union { /* This union is 4 bytes in size. */ -- cgit v1.2.3 From 9736d2a95e36ac3f60b063f498961103f3d4f165 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:09:10 -0700 Subject: slub: remove kmem_cache->reserved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reserved field was only used for embedding an rcu_head in the data structure. With the previous commit, we no longer need it. That lets us remove the 'reserved' argument to a lot of functions. Link: http://lkml.kernel.org/r/20180518194519.3820-16-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 3773e26c08c1..09fa2c6f0e68 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -101,7 +101,6 @@ struct kmem_cache { void (*ctor)(void *); unsigned int inuse; /* Offset to metadata */ unsigned int align; /* Alignment */ - unsigned int reserved; /* Reserved bytes at the end of slabs */ unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ -- cgit v1.2.3 From df2cc96e77011cf7989208b206da9817e0321028 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 7 Jun 2018 17:09:25 -0700 Subject: userfaultfd: prevent non-cooperative events vs mcopy_atomic races If a process monitored with userfaultfd changes it's memory mappings or forks() at the same time as uffd monitor fills the process memory with UFFDIO_COPY, the actual creation of page table entries and copying of the data in mcopy_atomic may happen either before of after the memory mapping modifications and there is no way for the uffd monitor to maintain consistent view of the process memory layout. For instance, let's consider fork() running in parallel with userfaultfd_copy(): process | uffd monitor ---------------------------------+------------------------------ fork() | userfaultfd_copy() ... | ... dup_mmap() | down_read(mmap_sem) down_write(mmap_sem) | /* create PTEs, copy data */ dup_uffd() | up_read(mmap_sem) copy_page_range() | up_write(mmap_sem) | dup_uffd_complete() | /* notify monitor */ | If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be present by the time copy_page_range() is called and they will appear in the child's memory mappings. However, if the fork() is the first to take the mmap_sem, the new pages won't be mapped in the child's address space. If the pages are not present and child tries to access them, the monitor will get page fault notification and everything is fine. However, if the pages *are present*, the child can access them without uffd noticing. And if we copy them into child it'll see the wrong data. Since we are talking about background copy, we'd need to decide whether the pages should be copied or not regardless #PF notifications. Since userfaultfd monitor has no way to determine what was the order, let's disallow userfaultfd_copy in parallel with the non-cooperative events. In such case we return -EAGAIN and the uffd monitor can understand that userfaultfd_copy() clashed with a non-cooperative event and take an appropriate action. Link: http://lkml.kernel.org/r/1527061324-19949-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Pavel Emelyanov Cc: Andrea Arcangeli Cc: Mike Kravetz Cc: Andrei Vagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f2f3b68ba910..e091f0a11b11 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -31,10 +31,12 @@ extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len); + unsigned long src_start, unsigned long len, + bool *mmap_changing); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len); + unsigned long len, + bool *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, -- cgit v1.2.3 From 4b33b6959581d5093af2badb489d914911d99eaf Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Thu, 7 Jun 2018 17:09:36 -0700 Subject: include/linux/gfp.h: fix the annotation of GFP_ZONE_TABLE When bit is equal to 0x4, it means OPT_ZONE_DMA32 should be got from GFP_ZONE_TABLE. OPT_ZONE_DMA32 shall be equal to ZONE_DMA32 or ZONE_NORMAL according to the status of CONFIG_ZONE_DMA32. Similarly, when bit is equal to 0xc, that means OPT_ZONE_DMA32 should be got with an allocation policy GFP_MOVABLE. So ZONE_DMA32 or ZONE_NORMAL is the possible result value. Link: http://lkml.kernel.org/r/20180601163403.1032-1-yehs2007@zoho.com Signed-off-by: Huaisheng Ye Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Kate Stewart Cc: "Levin, Alexander (Sasha Levin)" Cc: Greg Kroah-Hartman Cc: Christoph Hellwig Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7f860ea29ec6..a6afcec53795 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -343,7 +343,7 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) - * 0x4 => DMA32 or DMA or NORMAL + * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA) @@ -351,7 +351,7 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) - * 0xc => DMA32 (MOVABLE+DMA32) + * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) -- cgit v1.2.3 From e81bf9793b1861d74953ef041b4f6c7faecc2dbd Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 7 Jun 2018 17:09:44 -0700 Subject: mem_cgroup: make sure moving_account, move_lock_task and stat_cpu in the same cacheline The LKP robot found a 27% will-it-scale/page_fault3 performance regression regarding commit e27be240df53("mm: memcg: make sure memory.events is uptodate when waking pollers"). What the test does is: 1 mkstemp() a 128M file on a tmpfs; 2 start $nr_cpu processes, each to loop the following: 2.1 mmap() this file in shared write mode; 2.2 write 0 to this file in a PAGE_SIZE step till the end of the file; 2.3 unmap() this file and repeat this process. 3 After 5 minutes, check how many loops they managed to complete, the higher the better. The commit itself looks innocent enough as it merely changed some event counting mechanism and this test didn't trigger those events at all. Perf shows increased cycles spent on accessing root_mem_cgroup->stat_cpu in count_memcg_event_mm()(called by handle_mm_fault()) and in __mod_memcg_state() called by page_add_file_rmap(). So it's likely due to the changed layout of 'struct mem_cgroup' that either make stat_cpu falling into a constantly modifying cacheline or some hot fields stop being in the same cacheline. I verified this by moving memory_events[] back to where it was: : --- a/include/linux/memcontrol.h : +++ b/include/linux/memcontrol.h : @@ -205,7 +205,6 @@ struct mem_cgroup { : int oom_kill_disable; : : /* memory.events */ : - atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; : struct cgroup_file events_file; : : /* protect arrays of thresholds */ : @@ -238,6 +237,7 @@ struct mem_cgroup { : struct mem_cgroup_stat_cpu __percpu *stat_cpu; : atomic_long_t stat[MEMCG_NR_STAT]; : atomic_long_t events[NR_VM_EVENT_ITEMS]; : + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; : : unsigned long socket_pressure; And performance restored. Later investigation found that as long as the following 3 fields moving_account, move_lock_task and stat_cpu are in the same cacheline, performance will be good. To avoid future performance surprise by other commits changing the layout of 'struct mem_cgroup', this patch makes sure the 3 fields stay in the same cacheline. One concern of this approach is, moving_account and move_lock_task could be modified when a process changes memory cgroup while stat_cpu is a always read field, it might hurt to place them in the same cacheline. I assume it is rare for a process to change memory cgroup so this should be OK. Link: https://lkml.kernel.org/r/20180528114019.GF9904@yexl-desktop Link: http://lkml.kernel.org/r/20180601071115.GA27302@intel.com Signed-off-by: Aaron Lu Reported-by: kernel test robot Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9c04cf8e6487..4f52ec755725 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -166,6 +166,15 @@ enum memcg_kmem_state { KMEM_ONLINE, }; +#if defined(CONFIG_SMP) +struct memcg_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define MEMCG_PADDING(name) struct memcg_padding name; +#else +#define MEMCG_PADDING(name) +#endif + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -212,7 +221,6 @@ struct mem_cgroup { int oom_kill_disable; /* memory.events */ - atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; /* handle for "memory.swap.events" */ @@ -235,19 +243,26 @@ struct mem_cgroup { * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + unsigned long move_lock_flags; + + MEMCG_PADDING(_pad1_); + /* * set > 0 if pages under this cgroup are moving to other cgroup. */ atomic_t moving_account; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; struct task_struct *move_lock_task; - unsigned long move_lock_flags; /* memory.stat */ struct mem_cgroup_stat_cpu __percpu *stat_cpu; + + MEMCG_PADDING(_pad2_); + atomic_long_t stat[MEMCG_NR_STAT]; atomic_long_t events[NR_VM_EVENT_ITEMS]; + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; -- cgit v1.2.3 From 72eb7de9c1580cf5705d4f592aca21a8438fde22 Mon Sep 17 00:00:00 2001 From: Sahara Date: Thu, 7 Jun 2018 17:09:48 -0700 Subject: mm: remove page_is_poisoned() from linux/mm.h When commit bd33ef368135 ("mm: enable page poisoning early at boot") got rid of the PAGE_EXT_DEBUG_POISON, page_is_poisoned in the header left behind. This patch cleans up the leftovers under the table. Link: http://lkml.kernel.org/r/1528101069-21637-1-git-send-email-kpark3469@gmail.com Signed-off-by: Sahara Acked-by: Michal Hocko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1dd588b7c649..f0f3905f8bb8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2532,12 +2532,10 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); -extern bool page_is_poisoned(struct page *page); #else static inline bool page_poisoning_enabled(void) { return false; } static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } -static inline bool page_is_poisoned(struct page *page) { return false; } #endif #ifdef CONFIG_DEBUG_PAGEALLOC -- cgit v1.2.3 From 6d0e8d53849fb491549a01893d786bf33d4b28df Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Jun 2018 17:10:27 -0700 Subject: include/linux/types.h: define aligned_ types based on uapi header has the same typedefs except that it prefixes them with double-underscore for user space. Use them for the kernel space typedefs. Link: http://lkml.kernel.org/r/1526350925-14922-2-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Reviewed-by: Andrew Morton Cc: Geert Uytterhoeven Cc: Alexey Dobriyan Cc: Lihao Liang Cc: Philippe Ombredanne Cc: Pekka Enberg Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index ec13d02b3481..be1589763e0a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -115,9 +115,9 @@ typedef __s64 int64_t; #endif /* this is a special 64bit data type that is 8-byte aligned */ -#define aligned_u64 __u64 __attribute__((aligned(8))) -#define aligned_be64 __be64 __attribute__((aligned(8))) -#define aligned_le64 __le64 __attribute__((aligned(8))) +#define aligned_u64 __aligned_u64 +#define aligned_be64 __aligned_be64 +#define aligned_le64 __aligned_le64 /** * The type used for indexing onto a disc or disc partition. -- cgit v1.2.3 From b22f22a3c199b3282b05a927c60eb8edbe42d25d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Jun 2018 17:10:30 -0700 Subject: include/linux/types.h: use fixed width types without double-underscore prefix This header file is not exported. It is safe to reference types without double-underscore prefix. Link: http://lkml.kernel.org/r/1526350925-14922-3-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Geert Uytterhoeven Cc: Alexey Dobriyan Cc: Lihao Liang Cc: Philippe Ombredanne Cc: Pekka Enberg Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/types.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index be1589763e0a..9834e90aa010 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -10,14 +10,14 @@ #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] -typedef __u32 __kernel_dev_t; +typedef u32 __kernel_dev_t; typedef __kernel_fd_set fd_set; typedef __kernel_dev_t dev_t; typedef __kernel_ino_t ino_t; typedef __kernel_mode_t mode_t; typedef unsigned short umode_t; -typedef __u32 nlink_t; +typedef u32 nlink_t; typedef __kernel_off_t off_t; typedef __kernel_pid_t pid_t; typedef __kernel_daddr_t daddr_t; @@ -95,23 +95,23 @@ typedef unsigned long ulong; #ifndef __BIT_TYPES_DEFINED__ #define __BIT_TYPES_DEFINED__ -typedef __u8 u_int8_t; -typedef __s8 int8_t; -typedef __u16 u_int16_t; -typedef __s16 int16_t; -typedef __u32 u_int32_t; -typedef __s32 int32_t; +typedef u8 u_int8_t; +typedef s8 int8_t; +typedef u16 u_int16_t; +typedef s16 int16_t; +typedef u32 u_int32_t; +typedef s32 int32_t; #endif /* !(__BIT_TYPES_DEFINED__) */ -typedef __u8 uint8_t; -typedef __u16 uint16_t; -typedef __u32 uint32_t; +typedef u8 uint8_t; +typedef u16 uint16_t; +typedef u32 uint32_t; #if defined(__GNUC__) -typedef __u64 uint64_t; -typedef __u64 u_int64_t; -typedef __s64 int64_t; +typedef u64 uint64_t; +typedef u64 u_int64_t; +typedef s64 int64_t; #endif /* this is a special 64bit data type that is 8-byte aligned */ -- cgit v1.2.3 From cbdc61ae1fa5d824fcfd59282b040f21144999ab Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 7 Jun 2018 17:10:51 -0700 Subject: lib/mpi: headers cleanup MPI headers contain definitions for huge number of non-existing functions. Most part of these functions was removed in 2012 by Dmitry Kasatkin - 7cf4206a99d1 ("Remove unused code from MPI library") - 9e235dcaf4f6 ("Revert "crypto: GnuPG based MPI lib - additional ...") - bc95eeadf5c6 ("lib/mpi: removed unused functions") however headers wwere not updated properly. Also I deleted some unused macros. Link: http://lkml.kernel.org/r/fb2fc1ef-1185-f0a3-d8d0-173d2f97bbaf@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Andrew Morton Cc: Dmitry Kasatkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mpi.h | 61 ----------------------------------------------------- 1 file changed, 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mpi.h b/include/linux/mpi.h index 1cc5ffb769af..7cd1473c64a4 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -53,93 +53,32 @@ struct gcry_mpi { typedef struct gcry_mpi *MPI; #define mpi_get_nlimbs(a) ((a)->nlimbs) -#define mpi_is_neg(a) ((a)->sign) /*-- mpiutil.c --*/ MPI mpi_alloc(unsigned nlimbs); -MPI mpi_alloc_secure(unsigned nlimbs); -MPI mpi_alloc_like(MPI a); void mpi_free(MPI a); int mpi_resize(MPI a, unsigned nlimbs); -int mpi_copy(MPI *copy, const MPI a); -void mpi_clear(MPI a); -int mpi_set(MPI w, MPI u); -int mpi_set_ui(MPI w, ulong u); -MPI mpi_alloc_set_ui(unsigned long u); -void mpi_m_check(MPI a); -void mpi_swap(MPI a, MPI b); /*-- mpicoder.c --*/ -MPI do_encode_md(const void *sha_buffer, unsigned nbits); MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes); MPI mpi_read_from_buffer(const void *buffer, unsigned *ret_nread); MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len); -int mpi_fromstr(MPI val, const char *str); -u32 mpi_get_keyid(MPI a, u32 *keyid); void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign); int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, int *sign); -void *mpi_get_secure_buffer(MPI a, unsigned *nbytes, int *sign); int mpi_write_to_sgl(MPI a, struct scatterlist *sg, unsigned nbytes, int *sign); -#define log_mpidump g10_log_mpidump - -/*-- mpi-add.c --*/ -int mpi_add_ui(MPI w, MPI u, ulong v); -int mpi_add(MPI w, MPI u, MPI v); -int mpi_addm(MPI w, MPI u, MPI v, MPI m); -int mpi_sub_ui(MPI w, MPI u, ulong v); -int mpi_sub(MPI w, MPI u, MPI v); -int mpi_subm(MPI w, MPI u, MPI v, MPI m); - -/*-- mpi-mul.c --*/ -int mpi_mul_ui(MPI w, MPI u, ulong v); -int mpi_mul_2exp(MPI w, MPI u, ulong cnt); -int mpi_mul(MPI w, MPI u, MPI v); -int mpi_mulm(MPI w, MPI u, MPI v, MPI m); - -/*-- mpi-div.c --*/ -ulong mpi_fdiv_r_ui(MPI rem, MPI dividend, ulong divisor); -int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor); -int mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor); -int mpi_fdiv_qr(MPI quot, MPI rem, MPI dividend, MPI divisor); -int mpi_tdiv_r(MPI rem, MPI num, MPI den); -int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den); -int mpi_tdiv_q_2exp(MPI w, MPI u, unsigned count); -int mpi_divisible_ui(const MPI dividend, ulong divisor); - -/*-- mpi-gcd.c --*/ -int mpi_gcd(MPI g, const MPI a, const MPI b); - /*-- mpi-pow.c --*/ -int mpi_pow(MPI w, MPI u, MPI v); int mpi_powm(MPI res, MPI base, MPI exp, MPI mod); -/*-- mpi-mpow.c --*/ -int mpi_mulpowm(MPI res, MPI *basearray, MPI *exparray, MPI mod); - /*-- mpi-cmp.c --*/ int mpi_cmp_ui(MPI u, ulong v); int mpi_cmp(MPI u, MPI v); -/*-- mpi-scan.c --*/ -int mpi_getbyte(MPI a, unsigned idx); -void mpi_putbyte(MPI a, unsigned idx, int value); -unsigned mpi_trailing_zeros(MPI a); - /*-- mpi-bit.c --*/ void mpi_normalize(MPI a); unsigned mpi_get_nbits(MPI a); -int mpi_test_bit(MPI a, unsigned n); -int mpi_set_bit(MPI a, unsigned n); -int mpi_set_highbit(MPI a, unsigned n); -void mpi_clear_highbit(MPI a, unsigned n); -void mpi_clear_bit(MPI a, unsigned n); -int mpi_rshift(MPI x, MPI a, unsigned n); - -/*-- mpi-inv.c --*/ -int mpi_invm(MPI x, MPI u, MPI v); /* inline functions */ -- cgit v1.2.3