From 69c978232aaa99476f9bd002c2a29a84fa3779b5 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:33:49 -0700 Subject: mm: make get_mm_counter static-inline Make get_mm_counter() always static inline, it is simple enough for that. And remove unused set_mm_counter() bloat-o-meter: add/remove: 0/1 grow/shrink: 4/12 up/down: 99/-341 (-242) function old new delta try_to_unmap_one 886 952 +66 sys_remap_file_pages 1214 1230 +16 dup_mm 1684 1700 +16 do_exit 2277 2278 +1 zap_page_range 208 205 -3 unmap_region 304 296 -8 static.oom_kill_process 554 546 -8 try_to_unmap_file 1716 1700 -16 getrusage 925 909 -16 flush_old_exec 1704 1688 -16 static.dump_header 416 390 -26 acct_update_integrals 218 187 -31 do_task_stat 2986 2954 -32 get_mm_counter 34 - -34 xacct_add_tsk 371 334 -37 task_statm 172 118 -54 task_mem 383 323 -60 try_to_unmap_one() grows because update_hiwater_rss() now completely inline. Signed-off-by: Konstantin Khlebnikov Cc: KAMEZAWA Hiroyuki Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 17b27cd269c4..378bccebc26c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1058,19 +1058,20 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, /* * per-process(per-mm_struct) statistics. */ -static inline void set_mm_counter(struct mm_struct *mm, int member, long value) -{ - atomic_long_set(&mm->rss_stat.count[member], value); -} - -#if defined(SPLIT_RSS_COUNTING) -unsigned long get_mm_counter(struct mm_struct *mm, int member); -#else static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { - return atomic_long_read(&mm->rss_stat.count[member]); -} + long val = atomic_long_read(&mm->rss_stat.count[member]); + +#ifdef SPLIT_RSS_COUNTING + /* + * counter is updated in asynchronous manner and may go to minus. + * But it's never be expected number for users. + */ + if (val < 0) + val = 0; #endif + return (unsigned long)val; +} static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { -- cgit v1.2.3 From 67f96aa252e606cdf6c3cf1032952ec207ec0cf0 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 21 Mar 2012 16:33:50 -0700 Subject: mm: make swapin readahead skip over holes Ever since abandoning the virtual scan of processes, for scalability reasons, swap space has been a little more fragmented than before. This can lead to the situation where a large memory user is killed, swap space ends up full of "holes" and swapin readahead is totally ineffective. On my home system, after killing a leaky firefox it took over an hour to page just under 2GB of memory back in, slowing the virtual machines down to a crawl. This patch makes swapin readahead simply skip over holes, instead of stopping at them. This allows the system to swap things back in at rates of several MB/second, instead of a few hundred kB/second. The checks done in valid_swaphandles are already done in read_swap_cache_async as well, allowing us to remove a fair amount of code. [akpm@linux-foundation.org: fix it for page_cluster >= 32] Signed-off-by: Rik van Riel Cc: Minchan Kim Cc: KOSAKI Motohiro Acked-by: Johannes Weiner Acked-by: Mel Gorman Cc: Adrian Drzewiecki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3e60228e7299..64a7dba67840 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -329,7 +329,6 @@ extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); -extern int valid_swaphandles(swp_entry_t, unsigned long *); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); -- cgit v1.2.3 From 7be62de99adcab4449d416977b4274985c5fe023 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 21 Mar 2012 16:33:52 -0700 Subject: vmscan: kswapd carefully call compaction With CONFIG_COMPACTION enabled, kswapd does not try to free contiguous free pages, even when it is woken for a higher order request. This could be bad for eg. jumbo frame network allocations, which are done from interrupt context and cannot compact memory themselves. Higher than before allocation failure rates in the network receive path have been observed in kernels with compaction enabled. Teach kswapd to defragment the memory zones in a node, but only if required and compaction is not deferred in a zone. [akpm@linux-foundation.org: reduce scope of zones_need_compaction] Signed-off-by: Rik van Riel Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index bb2bbdbe5464..7a9323aef4a3 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -23,6 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, bool sync); +extern int compact_pgdat(pg_data_t *pgdat, int order); extern unsigned long compaction_suitable(struct zone *zone, int order); /* Do not skip compaction more than 64 times */ @@ -62,6 +63,11 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_CONTINUE; } +static inline int compact_pgdat(pg_data_t *pgdat, int order) +{ + return COMPACT_CONTINUE; +} + static inline unsigned long compaction_suitable(struct zone *zone, int order) { return COMPACT_SKIPPED; -- cgit v1.2.3 From aff622495c9a0b56148192e53bdec539f5e147f2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 21 Mar 2012 16:33:52 -0700 Subject: vmscan: only defer compaction for failed order and higher Currently a failed order-9 (transparent hugepage) compaction can lead to memory compaction being temporarily disabled for a memory zone. Even if we only need compaction for an order 2 allocation, eg. for jumbo frames networking. The fix is relatively straightforward: keep track of the highest order at which compaction is succeeding, and only defer compaction for orders at which compaction is failing. Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 14 ++++++++++---- include/linux/mmzone.h | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 7a9323aef4a3..51a90b7f2d60 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -34,20 +34,26 @@ extern unsigned long compaction_suitable(struct zone *zone, int order); * allocation success. 1 << compact_defer_limit compactions are skipped up * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT */ -static inline void defer_compaction(struct zone *zone) +static inline void defer_compaction(struct zone *zone, int order) { zone->compact_considered = 0; zone->compact_defer_shift++; + if (order < zone->compact_order_failed) + zone->compact_order_failed = order; + if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; } /* Returns true if compaction should be skipped this time */ -static inline bool compaction_deferred(struct zone *zone) +static inline bool compaction_deferred(struct zone *zone, int order) { unsigned long defer_limit = 1UL << zone->compact_defer_shift; + if (order < zone->compact_order_failed) + return false; + /* Avoid possible overflow */ if (++zone->compact_considered > defer_limit) zone->compact_considered = defer_limit; @@ -73,11 +79,11 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_SKIPPED; } -static inline void defer_compaction(struct zone *zone) +static inline void defer_compaction(struct zone *zone, int order) { } -static inline bool compaction_deferred(struct zone *zone) +static inline bool compaction_deferred(struct zone *zone, int order) { return 1; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 650ba2fb3301..dff711509661 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -365,6 +365,7 @@ struct zone { */ unsigned int compact_considered; unsigned int compact_defer_shift; + int compact_order_failed; #endif ZONE_PADDING(_pad1_) -- cgit v1.2.3 From 025c5b2451e42c9e8dfdecd6dc84956ce8f321b5 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Mar 2012 16:33:57 -0700 Subject: thp: optimize away unnecessary page table locking Currently when we check if we can handle thp as it is or we need to split it into regular sized pages, we hold page table lock prior to check whether a given pmd is mapping thp or not. Because of this, when it's not "huge pmd" we suffer from unnecessary lock/unlock overhead. To remove it, this patch introduces a optimized check function and replace several similar logics with it. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Naoya Horiguchi Cc: David Rientjes Cc: Andi Kleen Cc: Wu Fengguang Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Cc: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1b921299abc4..f56cacb4fec3 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -113,6 +113,18 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); +extern int __pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma); +/* mmap_sem must be held on entry */ +static inline int pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma) +{ + VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); + if (pmd_trans_huge(*pmd)) + return __pmd_trans_huge_lock(pmd, vma); + else + return 0; +} static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -176,6 +188,11 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } +static inline int pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ -- cgit v1.2.3 From e873c49fbfdd595481976b915850e682441bcbec Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Mar 2012 16:33:58 -0700 Subject: pagemap: export KPF_THP This flag shows that a given page is a subpage of a transparent hugepage. It helps us debug and test the kernel by showing physical address of thp. Signed-off-by: Naoya Horiguchi Reviewed-by: Wu Fengguang Reviewed-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Cc: David Rientjes Cc: Andi Kleen Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel-page-flags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index bd92a89f4b0a..26a65711676f 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -30,6 +30,7 @@ #define KPF_NOPAGE 20 #define KPF_KSM 21 +#define KPF_THP 22 /* kernel hacking assistances * WARNING: subject to change, never rely on them! -- cgit v1.2.3 From ce1744f4ed20ca873360e54502f8a71564ef7cc6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:33:59 -0700 Subject: mm: replace PAGE_MIGRATION with IS_ENABLED(CONFIG_MIGRATION) Since commit 2a11c8ea20bf ("kconfig: Introduce IS_ENABLED(), IS_BUILTIN() and IS_MODULE()") there is a generic grep-friendly method for checking config options in C expressions. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 05ed2828a553..855c337b20c3 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -8,7 +8,6 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); #ifdef CONFIG_MIGRATION -#define PAGE_MIGRATION 1 extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, @@ -32,7 +31,6 @@ extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); #else -#define PAGE_MIGRATION 0 static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, -- cgit v1.2.3 From 978ea78b65794ef07eb66b9946064dea66b52554 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 21 Mar 2012 16:34:01 -0700 Subject: rmap: remove __anon_vma_link() declaration This declaration is not used anymore, remove it. Signed-off-by: Xiao Guangrong Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1cdd62a2788a..fd07c4542cee 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -122,7 +122,6 @@ void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); void anon_vma_moveto_tail(struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); -void __anon_vma_link(struct vm_area_struct *); static inline void anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) -- cgit v1.2.3 From b76437579d1344b612cf1851ae610c636cec7db0 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Wed, 21 Mar 2012 16:34:04 -0700 Subject: procfs: mark thread stack correctly in proc//maps Stack for a new thread is mapped by userspace code and passed via sys_clone. This memory is currently seen as anonymous in /proc//maps, which makes it difficult to ascertain which mappings are being used for thread stacks. This patch uses the individual task stack pointers to determine which vmas are actually thread stacks. For a multithreaded program like the following: #include void *thread_main(void *foo) { while(1); } int main() { pthread_t t; pthread_create(&t, NULL, thread_main, NULL); pthread_join(t, NULL); } proc/PID/maps looks like the following: 00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out 00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out 019ef000-01a10000 rw-p 00000000 00:00 0 [heap] 7f8a44491000-7f8a44492000 ---p 00000000 00:00 0 7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0 7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0 7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0 7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0 7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0 7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack] 7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso] ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall] Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack since the earlier vma that has no permissions (7f8a44e3d000-7f8a4503d000) but that is not always a reliable way to find out which vma is a thread stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps has the same content. With this patch in place, /proc/PID/task/TID/maps are treated as 'maps as the task would see it' and hence, only the vma that that task uses as stack is marked as [stack]. All other 'stack' vmas are marked as anonymous memory. /proc/PID/maps acts as a thread group level view, where all thread stack vmas are marked as [stack:TID] where TID is the process ID of the task that uses that vma as stack, while the process stack is marked as [stack]. So /proc/PID/maps will look like this: 00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out 00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out 019ef000-01a10000 rw-p 00000000 00:00 0 [heap] 7f8a44491000-7f8a44492000 ---p 00000000 00:00 0 7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442] 7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0 7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0 7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0 7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0 7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0 7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack] 7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso] ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall] Thus marking all vmas that are used as stacks by the threads in the thread group along with the process stack. The task level maps will however like this: 00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out 00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out 019ef000-01a10000 rw-p 00000000 00:00 0 [heap] 7f8a44491000-7f8a44492000 ---p 00000000 00:00 0 7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack] 7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0 7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0 7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0 7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0 7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0 7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso] ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall] where only the vma that is being used as a stack by *that* task is marked as [stack]. Analogous changes have been made to /proc/PID/smaps, /proc/PID/numa_maps, /proc/PID/task/TID/smaps and /proc/PID/task/TID/numa_maps. Relevant snippets from smaps and numa_maps: [siddhesh@localhost ~ ]$ pgrep a.out 1441 [siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack" 7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442] 7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack] [siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack" 7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack] [siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack" 7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack] [siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack" 7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2 7fff6273a000 default stack anon=3 dirty=3 N0=3 [siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack" 7f8a44492000 default stack anon=2 dirty=2 N0=2 [siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack" 7fff6273a000 default stack anon=3 dirty=3 N0=3 [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix build] Signed-off-by: Siddhesh Poyarekar Cc: KOSAKI Motohiro Cc: Alexander Viro Cc: Jamie Lokier Cc: Mike Frysinger Cc: Alexey Dobriyan Cc: Matt Mackall Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 378bccebc26c..df17ff23d50e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1040,6 +1040,9 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma, !vma_growsup(vma->vm_next, addr); } +extern pid_t +vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); + extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len); -- cgit v1.2.3 From 08ab9b10d43aca091fdff58b69fc1ec89c5b8a83 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 21 Mar 2012 16:34:04 -0700 Subject: mm, oom: force oom kill on sysrq+f The oom killer chooses not to kill a thread if: - an eligible thread has already been oom killed and has yet to exit, and - an eligible thread is exiting but has yet to free all its memory and is not the thread attempting to currently allocate memory. SysRq+F manually invokes the global oom killer to kill a memory-hogging task. This is normally done as a last resort to free memory when no progress is being made or to test the oom killer itself. For both uses, we always want to kill a thread and never defer. This patch causes SysRq+F to always kill an eligible thread and can be used to force a kill even if another oom killed thread has failed to exit. Signed-off-by: David Rientjes Acked-by: KOSAKI Motohiro Acked-by: Pekka Enberg Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 552fba9c7d5a..3d7647536b03 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -49,7 +49,7 @@ extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *mask); + int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); -- cgit v1.2.3 From 385de35722c9a22917e7bc5e63cd83a8cffa5ecd Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Wed, 21 Mar 2012 16:34:05 -0700 Subject: thp: allow a hwpoisoned head page to be put back to LRU Andrea Arcangeli pointed out to me that a check in __memory_failure() which was intended to prevent THP tail pages from being checked for the absence of the PG_lru flag (something that is always the case), was also preventing THP head pages from being checked. A THP head page could actually benefit from the call to shake_page() by ending up being put back to a LRU, provided it had been waiting in a pagevec array. Andrea suggested that the "!PageTransCompound(p)" in the if-statement should be replaced by a "!PageTransTail(p)", thus allowing THP head pages to be checked and possibly shaken. Signed-off-by: Dean Nelson Cc: Jin Dongming Reviewed-by: Andrea Arcangeli Cc: Andi Kleen Cc: Hidetoshi Seto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e90a673be67e..6b25758e028e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -414,11 +414,26 @@ static inline int PageTransHuge(struct page *page) return PageHead(page); } +/* + * PageTransCompound returns true for both transparent huge pages + * and hugetlbfs pages, so it should only be called when it's known + * that hugetlbfs pages aren't involved. + */ static inline int PageTransCompound(struct page *page) { return PageCompound(page); } +/* + * PageTransTail returns true for both transparent huge pages + * and hugetlbfs pages, so it should only be called when it's known + * that hugetlbfs pages aren't involved. + */ +static inline int PageTransTail(struct page *page) +{ + return PageTail(page); +} + #else static inline int PageTransHuge(struct page *page) @@ -430,6 +445,11 @@ static inline int PageTransCompound(struct page *page) { return 0; } + +static inline int PageTransTail(struct page *page) +{ + return 0; +} #endif #ifdef CONFIG_MMU -- cgit v1.2.3 From f0cb3c76ae1ced85f9034480b1b24cd96530ec78 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:34:06 -0700 Subject: mm: drain percpu lru add/rotate page-vectors on cpu hot-unplug This cpu hotplug hook was accidentally removed in commit 00a62ce91e55 ("mm: fix Committed_AS underflow on large NR_CPUS environment") The visible effect of this accident: some pages are borrowed in per-cpu page-vectors. Truncate can deal with it, but these pages cannot be reused while this cpu is offline. So this is like a temporary memory leak. Signed-off-by: Konstantin Khlebnikov Cc: Dave Hansen Cc: KOSAKI Motohiro Cc: Eric B Munson Cc: Mel Gorman Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 64a7dba67840..b86b5c20617d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -223,6 +223,7 @@ extern void lru_add_page_tail(struct zone* zone, extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); +extern void lru_add_drain_cpu(int cpu); extern int lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_page(struct page *page); -- cgit v1.2.3 From e845e199362cc5712ba0e7eedc14eed70e144258 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 21 Mar 2012 16:34:10 -0700 Subject: mm, memcg: pass charge order to oom killer The oom killer typically displays the allocation order at the time of oom as a part of its diangostic messages (for global, cpuset, and mempolicy ooms). The memory controller may also pass the charge order to the oom killer so it can emit the same information. This is useful in determining how large the memory allocation is that triggered the oom killer. Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b80de520670b..d90965086fae 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -77,7 +77,8 @@ extern void mem_cgroup_uncharge_end(void); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask); +extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); -- cgit v1.2.3 From cc9a6c8776615f9c194ccf0b63a0aa5628235545 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:11 -0700 Subject: cpuset: mm: reduce large amounts of memory barrier related damage v3 Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating at large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86. The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were 3.3.0-rc3 3.3.0-rc3 rc3-vanilla nobarrier-v2r1 Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%) Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%) Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%) Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%) Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%) Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%) Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%) Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%) Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%) Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%) Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%) Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%) Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%) Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%) Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%) MMTests Statistics: duration Sys Time Running Test (seconds) 135.68 132.17 User+Sys Time Running Test (seconds) 164.2 160.13 Total Elapsed Time (seconds) 123.46 120.87 The overall improvement is small but the System CPU time is much improved and roughly in correlation to what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. Signed-off-by: Mel Gorman Cc: Miao Xie Cc: David Rientjes Cc: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 47 ++++++++++++++++++++--------------------------- include/linux/init_task.h | 8 ++++++++ include/linux/sched.h | 2 +- 3 files changed, 29 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index e9eaec522655..7a7e5fd2a277 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); /* - * reading current mems_allowed and mempolicy in the fastpath must protected - * by get_mems_allowed() + * get_mems_allowed is required when making decisions involving mems_allowed + * such as during page allocation. mems_allowed can be updated in parallel + * and depending on the new value an operation can fail potentially causing + * process failure. A retry loop with get_mems_allowed and put_mems_allowed + * prevents these artificial failures. */ -static inline void get_mems_allowed(void) +static inline unsigned int get_mems_allowed(void) { - current->mems_allowed_change_disable++; - - /* - * ensure that reading mems_allowed and mempolicy happens after the - * update of ->mems_allowed_change_disable. - * - * the write-side task finds ->mems_allowed_change_disable is not 0, - * and knows the read-side task is reading mems_allowed or mempolicy, - * so it will clear old bits lazily. - */ - smp_mb(); + return read_seqcount_begin(¤t->mems_allowed_seq); } -static inline void put_mems_allowed(void) +/* + * If this returns false, the operation that took place after get_mems_allowed + * may have failed. It is up to the caller to retry the operation if + * appropriate. + */ +static inline bool put_mems_allowed(unsigned int seq) { - /* - * ensure that reading mems_allowed and mempolicy before reducing - * mems_allowed_change_disable. - * - * the write-side task will know that the read-side task is still - * reading mems_allowed or mempolicy, don't clears old bits in the - * nodemask. - */ - smp_mb(); - --ACCESS_ONCE(current->mems_allowed_change_disable); + return !read_seqcount_retry(¤t->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { task_lock(current); + write_seqcount_begin(¤t->mems_allowed_seq); current->mems_allowed = nodemask; + write_seqcount_end(¤t->mems_allowed_seq); task_unlock(current); } @@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } -static inline void get_mems_allowed(void) +static inline unsigned int get_mems_allowed(void) { + return 0; } -static inline void put_mems_allowed(void) +static inline bool put_mems_allowed(unsigned int seq) { + return true; } #endif /* !CONFIG_CPUSETS */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f994d51f70f2..e4baff5f7ff4 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -29,6 +29,13 @@ extern struct fs_struct init_fs; #define INIT_GROUP_RWSEM(sig) #endif +#ifdef CONFIG_CPUSETS +#define INIT_CPUSET_SEQ \ + .mems_allowed_seq = SEQCNT_ZERO, +#else +#define INIT_CPUSET_SEQ +#endif + #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ @@ -192,6 +199,7 @@ extern struct cred init_cred; INIT_FTRACE_GRAPH \ INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ + INIT_CPUSET_SEQ \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index e074e1e54f85..0c147a4260a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1514,7 +1514,7 @@ struct task_struct { #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ - int mems_allowed_change_disable; + seqcount_t mems_allowed_seq; /* Seqence no to catch updates */ int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif -- cgit v1.2.3 From a1d776ee3147cec2a54a645e92eb2e3e2f65a137 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 21 Mar 2012 16:34:12 -0700 Subject: hugetlb: cleanup hugetlb.h Make a couple of small cleanups to linux/include/hugetlb.h. The set_file_hugepages() function, which was not used anywhere is removed, and the hugetlbfs_config and hugetlbfs_inode_info structures with its HUGETLBFS_I helper function are moved into inode.c, the only place they were used. These structures are really linked to the hugetlbfs filesystem specifically not to hugepage mm handling in general, so they belong in the filesystem code not in a generally available header. It would be nice to move the hugetlbfs_sb_info (superblock) structure in there as well, but it's currently needed in a number of places via the hstate_vma() and hstate_inode(). Signed-off-by: David Gibson Cc: Hugh Dickins Cc: Paul Mackerras Cc: Andrew Barry Cc: Mel Gorman Cc: Minchan Kim Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d9d6c868b86b..7adc4923e7ac 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -128,15 +128,6 @@ enum { }; #ifdef CONFIG_HUGETLBFS -struct hugetlbfs_config { - uid_t uid; - gid_t gid; - umode_t mode; - long nr_blocks; - long nr_inodes; - struct hstate *hstate; -}; - struct hugetlbfs_sb_info { long max_blocks; /* blocks allowed */ long free_blocks; /* blocks free */ @@ -146,17 +137,6 @@ struct hugetlbfs_sb_info { struct hstate *hstate; }; - -struct hugetlbfs_inode_info { - struct shared_policy policy; - struct inode vfs_inode; -}; - -static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) -{ - return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); -} - static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) { return sb->s_fs_info; @@ -179,14 +159,9 @@ static inline int is_file_hugepages(struct file *file) return 0; } -static inline void set_file_hugepages(struct file *file) -{ - file->f_op = &hugetlbfs_file_operations; -} #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 -#define set_file_hugepages(file) BUG() static inline struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, struct user_struct **user, int creat_flags) { -- cgit v1.2.3 From 90481622d75715bfcb68501280a917dbfe516029 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 21 Mar 2012 16:34:12 -0700 Subject: hugepages: fix use after free bug in "quota" handling hugetlbfs_{get,put}_quota() are badly named. They don't interact with the general quota handling code, and they don't much resemble its behaviour. Rather than being about maintaining limits on on-disk block usage by particular users, they are instead about maintaining limits on in-memory page usage (including anonymous MAP_PRIVATE copied-on-write pages) associated with a particular hugetlbfs filesystem instance. Worse, they work by having callbacks to the hugetlbfs filesystem code from the low-level page handling code, in particular from free_huge_page(). This is a layering violation of itself, but more importantly, if the kernel does a get_user_pages() on hugepages (which can happen from KVM amongst others), then the free_huge_page() can be delayed until after the associated inode has already been freed. If an unmount occurs at the wrong time, even the hugetlbfs superblock where the "quota" limits are stored may have been freed. Andrew Barry proposed a patch to fix this by having hugepages, instead of storing a pointer to their address_space and reaching the superblock from there, had the hugepages store pointers directly to the superblock, bumping the reference count as appropriate to avoid it being freed. Andrew Morton rejected that version, however, on the grounds that it made the existing layering violation worse. This is a reworked version of Andrew's patch, which removes the extra, and some of the existing, layering violation. It works by introducing the concept of a hugepage "subpool" at the lower hugepage mm layer - that is a finite logical pool of hugepages to allocate from. hugetlbfs now creates a subpool for each filesystem instance with a page limit set, and a pointer to the subpool gets added to each allocated hugepage, instead of the address_space pointer used now. The subpool has its own lifetime and is only freed once all pages in it _and_ all other references to it (i.e. superblocks) are gone. subpools are optional - a NULL subpool pointer is taken by the code to mean that no subpool limits are in effect. Previous discussion of this bug found in: "Fix refcounting in hugetlbfs quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or http://marc.info/?l=linux-mm&m=126928970510627&w=1 v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to alloc_huge_page() - since it already takes the vma, it is not necessary. Signed-off-by: Andrew Barry Signed-off-by: David Gibson Cc: Hugh Dickins Cc: Mel Gorman Cc: Minchan Kim Cc: Hillf Danton Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7adc4923e7ac..cf0181738c9e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -14,6 +14,15 @@ struct user_struct; #include #include +struct hugepage_subpool { + spinlock_t lock; + long count; + long max_hpages, used_hpages; +}; + +struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); +void hugepage_put_subpool(struct hugepage_subpool *spool); + int PageHuge(struct page *page); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); @@ -129,12 +138,11 @@ enum { #ifdef CONFIG_HUGETLBFS struct hugetlbfs_sb_info { - long max_blocks; /* blocks allowed */ - long free_blocks; /* blocks free */ long max_inodes; /* inodes allowed */ long free_inodes; /* inodes free */ spinlock_t stat_lock; struct hstate *hstate; + struct hugepage_subpool *spool; }; static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) @@ -146,8 +154,6 @@ extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, struct user_struct **user, int creat_flags); -int hugetlb_get_quota(struct address_space *mapping, long delta); -void hugetlb_put_quota(struct address_space *mapping, long delta); static inline int is_file_hugepages(struct file *file) { -- cgit v1.2.3 From 05af2e104a0c282dcd9303431e1360750ba76de6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 21 Mar 2012 16:34:13 -0700 Subject: mm, counters: remove task argument to sync_mm_rss() and __sync_task_rss_stat() sync_mm_rss() can only be used for current to avoid race conditions in iterating and clearing its per-task counters. Remove the task argument for it and its helper function, __sync_task_rss_stat(), to avoid thinking it can be used safely for anything other than current. Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index df17ff23d50e..ce2b2a3b2876 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1131,9 +1131,9 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, } #if defined(SPLIT_RSS_COUNTING) -void sync_mm_rss(struct task_struct *task, struct mm_struct *mm); +void sync_mm_rss(struct mm_struct *mm); #else -static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +static inline void sync_mm_rss(struct mm_struct *mm) { } #endif -- cgit v1.2.3 From 40716e29243de46720e5773797791466c28904ec Mon Sep 17 00:00:00 2001 From: Steven Truelove Date: Wed, 21 Mar 2012 16:34:14 -0700 Subject: hugetlbfs: fix alignment of huge page requests When calling shmget() with SHM_HUGETLB, shmget aligns the request size to PAGE_SIZE, but this is not sufficient. Modify hugetlb_file_setup() to align requests to the huge page size, and to accept an address argument so that all alignment checks can be performed in hugetlb_file_setup(), rather than in its callers. Change newseg() and mmap_pgoff() to match the new prototype and eliminate a now redundant alignment check. [akpm@linux-foundation.org: fix build] Signed-off-by: Steven Truelove Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cf0181738c9e..000837e126e6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -152,7 +152,8 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, +struct file *hugetlb_file_setup(const char *name, unsigned long addr, + size_t size, vm_flags_t acct, struct user_struct **user, int creat_flags); static inline int is_file_hugepages(struct file *file) @@ -168,7 +169,8 @@ static inline int is_file_hugepages(struct file *file) #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 -static inline struct file *hugetlb_file_setup(const char *name, size_t size, +static inline struct file * +hugetlb_file_setup(const char *name, unsigned long addr, size_t size, vm_flags_t acctflag, struct user_struct **user, int creat_flags) { return ERR_PTR(-ENOSYS); -- cgit v1.2.3 From 8d13bddd11c10db40e2c81b4b224c11126691fc0 Mon Sep 17 00:00:00 2001 From: Kautuk Consul Date: Wed, 21 Mar 2012 16:34:15 -0700 Subject: page_alloc.c: remove add_from_early_node_map() add_from_early_node_map() is unused. Signed-off-by: Kautuk Consul Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ce2b2a3b2876..b1c8318e32b8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1295,8 +1295,6 @@ extern void get_pfn_range_for_nid(unsigned int nid, extern unsigned long find_min_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); -int add_from_early_node_map(struct range *range, int az, - int nr_range, int nid); extern void sparse_memory_present_with_active_regions(int nid); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -- cgit v1.2.3 From 31a79235fc75b506e282e43723107a40f3bc5c07 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 21 Mar 2012 16:34:18 -0700 Subject: memcg: replace MEM_CONT by MEM_RES_CTLR Correct an #endif comment in memcontrol.h from MEM_CONT to MEM_RES_CTLR. Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d90965086fae..e76f10731a86 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -392,7 +392,7 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { } -#endif /* CONFIG_CGROUP_MEM_CONT */ +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) static inline bool -- cgit v1.2.3 From 42aee6c495e07dba7410b863a360db6bb9ec6d66 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 21 Mar 2012 16:34:21 -0700 Subject: cgroup: revert ss_id_lock to spinlock Commit c1e2ee2dc436 ("memcg: replace ss->id_lock with a rwlock") has now been seen to cause the unfair behavior we should have expected from converting a spinlock to an rwlock: softlockup in cgroup_mkdir(), whose get_new_cssid() is waiting for the wlock, while there are 19 tasks using the rlock in css_get_next() to get on with their memcg workload (in an artificial test, admittedly). Yet lib/idr.c was made suitable for RCU way back: revert that commit, restoring ss->id_lock to a spinlock. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Eric Dumazet Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 501adb1b2f43..5a85b3415c1b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -498,7 +498,7 @@ struct cgroup_subsys { struct list_head sibling; /* used when use_id == true */ struct idr idr; - rwlock_t id_lock; + spinlock_t id_lock; /* should be defined only by modular subsystems */ struct module *module; -- cgit v1.2.3 From b24028572fb69e9dd6de8c359eba2b2c66baa889 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 21 Mar 2012 16:34:22 -0700 Subject: memcg: remove PCG_CACHE page_cgroup flag We record 'the page is cache' with the PCG_CACHE bit in page_cgroup. Here, "CACHE" means anonymous user pages (and SwapCache). This doesn't include shmem. Considering callers, at charge/uncharge, the caller should know what the page is and we don't need to record it by using one bit per page. This patch removes PCG_CACHE bit and make callers of mem_cgroup_charge_statistics() to specify what the page is. About page migration: Mapping of the used page is not touched during migra tion (see page_remove_rmap) so we can rely on it and push the correct charge type down to __mem_cgroup_uncharge_common from end_migration for unused page. The force flag was misleading was abused for skipping the needless page_mapped() / PageCgroupMigration() check, as we know the unused page is no longer mapped and cleared the migration flag just a few lines up. But doing the checks is no biggie and it's not worth adding another flag just to skip them. [akpm@linux-foundation.org: checkpatch fixes] [hughd@google.com: fix PageAnon uncharging] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Ying Han Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index a2d11771c84b..106029243ff4 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -4,7 +4,6 @@ enum { /* flags for mem_cgroup */ PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ - PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ PCG_MIGRATION, /* under page migration */ /* flags for mem_cgroup and file and I/O status */ @@ -64,11 +63,6 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ { return test_and_clear_bit(PCG_##lname, &pc->flags); } -/* Cache flag is set only once (at allocation) */ -TESTPCGFLAG(Cache, CACHE) -CLEARPCGFLAG(Cache, CACHE) -SETPCGFLAG(Cache, CACHE) - TESTPCGFLAG(Used, USED) CLEARPCGFLAG(Used, USED) SETPCGFLAG(Used, USED) @@ -85,7 +79,7 @@ static inline void lock_page_cgroup(struct page_cgroup *pc) { /* * Don't take this lock in IRQ context. - * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION + * This lock is for pc->mem_cgroup, USED, MIGRATION */ bit_spin_lock(PCG_LOCK, &pc->flags); } -- cgit v1.2.3 From a710920caedfcf56543136bfea300a6c593f9838 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:34:22 -0700 Subject: memcg: kill dead prev_priority stubs This code was removed in 25edde033291 ("vmscan: kill prev_priority completely") Signed-off-by: Konstantin Khlebnikov Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e76f10731a86..c54e5dfa1962 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -299,21 +299,6 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root, { } -static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) -{ - return 0; -} - -static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *memcg, - int priority) -{ -} - -static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *memcg, - int priority) -{ -} - static inline bool mem_cgroup_disabled(void) { return true; -- cgit v1.2.3 From 312734c04e2fecc58429aec98194e4ff12d8f7d6 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 21 Mar 2012 16:34:24 -0700 Subject: memcg: remove PCG_MOVE_LOCK flag from page_cgroup PCG_MOVE_LOCK is used for bit spinlock to avoid race between overwriting pc->mem_cgroup and page statistics accounting per memcg. This lock helps to avoid the race but the race is very rare because moving tasks between cgroup is not a usual job. So, it seems using 1bit per page is too costly. This patch changes this lock as per-memcg spinlock and removes PCG_MOVE_LOCK. If smaller lock is required, we'll be able to add some hashes but I'd like to start from this. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Greg Thelen Cc: Johannes Weiner Cc: Michal Hocko Cc: KOSAKI Motohiro Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 106029243ff4..7a3af748f32b 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -7,7 +7,6 @@ enum { PCG_USED, /* this object is in use. */ PCG_MIGRATION, /* under page migration */ /* flags for mem_cgroup and file and I/O status */ - PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ __NR_PCG_FLAGS, }; @@ -89,24 +88,6 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) bit_spin_unlock(PCG_LOCK, &pc->flags); } -static inline void move_lock_page_cgroup(struct page_cgroup *pc, - unsigned long *flags) -{ - /* - * We know updates to pc->flags of page cache's stats are from both of - * usual context or IRQ context. Disable IRQ to avoid deadlock. - */ - local_irq_save(*flags); - bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); -} - -static inline void move_unlock_page_cgroup(struct page_cgroup *pc, - unsigned long *flags) -{ - bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); - local_irq_restore(*flags); -} - #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; -- cgit v1.2.3 From 89c06bd52fb9ffceddf84f7309d2e8c9f1666216 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 21 Mar 2012 16:34:25 -0700 Subject: memcg: use new logic for page stat accounting Now, page-stat-per-memcg is recorded into per page_cgroup flag by duplicating page's status into the flag. The reason is that memcg has a feature to move a page from a group to another group and we have race between "move" and "page stat accounting", Under current logic, assume CPU-A and CPU-B. CPU-A does "move" and CPU-B does "page stat accounting". When CPU-A goes 1st, CPU-A CPU-B update "struct page" info. move_lock_mem_cgroup(memcg) see pc->flags copy page stat to new group overwrite pc->mem_cgroup. move_unlock_mem_cgroup(memcg) move_lock_mem_cgroup(mem) set pc->flags update page stat accounting move_unlock_mem_cgroup(mem) stat accounting is guarded by move_lock_mem_cgroup() and "move" logic (CPU-A) doesn't see changes in "struct page" information. But it's costly to have the same information both in 'struct page' and 'struct page_cgroup'. And, there is a potential problem. For example, assume we have PG_dirty accounting in memcg. PG_..is a flag for struct page. PCG_ is a flag for struct page_cgroup. (This is just an example. The same problem can be found in any kind of page stat accounting.) CPU-A CPU-B TestSet PG_dirty (delay) TestClear PG_dirty if (TestClear(PCG_dirty)) memcg->nr_dirty-- if (TestSet(PCG_dirty)) memcg->nr_dirty++ Here, memcg->nr_dirty = +1, this is wrong. This race was reported by Greg Thelen . Now, only FILE_MAPPED is supported but fortunately, it's serialized by page table lock and this is not real bug, _now_, If this potential problem is caused by having duplicated information in struct page and struct page_cgroup, we may be able to fix this by using original 'struct page' information. But we'll have a problem in "move account" Assume we use only PG_dirty. CPU-A CPU-B TestSet PG_dirty (delay) move_lock_mem_cgroup() if (PageDirty(page)) new_memcg->nr_dirty++ pc->mem_cgroup = new_memcg; move_unlock_mem_cgroup() move_lock_mem_cgroup() memcg = pc->mem_cgroup new_memcg->nr_dirty++ accounting information may be double-counted. This was original reason to have PCG_xxx flags but it seems PCG_xxx has another problem. I think we need a bigger lock as move_lock_mem_cgroup(page) TestSetPageDirty(page) update page stats (without any checks) move_unlock_mem_cgroup(page) This fixes both of problems and we don't have to duplicate page flag into page_cgroup. Please note: move_lock_mem_cgroup() is held only when there are possibility of "account move" under the system. So, in most path, status update will go without atomic locks. This patch introduces mem_cgroup_begin_update_page_stat() and mem_cgroup_end_update_page_stat() both should be called at modifying 'struct page' information if memcg takes care of it. as mem_cgroup_begin_update_page_stat() modify page information mem_cgroup_update_page_stat() => never check any 'struct page' info, just update counters. mem_cgroup_end_update_page_stat(). This patch is slow because we need to call begin_update_page_stat()/ end_update_page_stat() regardless of accounted will be changed or not. A following patch adds an easy optimization and reduces the cost. [akpm@linux-foundation.org: s/lock/locked/] [hughd@google.com: fix deadlock by avoiding stat lock when anon] Signed-off-by: KAMEZAWA Hiroyuki Cc: Greg Thelen Acked-by: Johannes Weiner Cc: Michal Hocko Cc: KOSAKI Motohiro Cc: Ying Han Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c54e5dfa1962..bf7ae01fc93b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -141,6 +141,31 @@ static inline bool mem_cgroup_disabled(void) return false; } +void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, + unsigned long *flags); + +static inline void mem_cgroup_begin_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ + if (mem_cgroup_disabled()) + return; + rcu_read_lock(); + *locked = false; + return __mem_cgroup_begin_update_page_stat(page, locked, flags); +} + +void __mem_cgroup_end_update_page_stat(struct page *page, + unsigned long *flags); +static inline void mem_cgroup_end_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ + if (mem_cgroup_disabled()) + return; + if (*locked) + __mem_cgroup_end_update_page_stat(page, flags); + rcu_read_unlock(); +} + void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx, int val); @@ -341,6 +366,16 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } +static inline void mem_cgroup_begin_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ +} + +static inline void mem_cgroup_end_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ +} + static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx) { -- cgit v1.2.3 From 2ff76f1193f8481f7e6c29304eea4006e8e51569 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 21 Mar 2012 16:34:25 -0700 Subject: memcg: remove PCG_FILE_MAPPED With the new lock scheme for updating memcg's page stat, we don't need a flag PCG_FILE_MAPPED which was duplicated information of page_mapped(). [hughd@google.com: cosmetic fix] [hughd@google.com: add comment to MEM_CGROUP_CHARGE_TYPE_MAPPED case in __mem_cgroup_uncharge_common()] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Greg Thelen Acked-by: Johannes Weiner Cc: Michal Hocko Cc: KOSAKI Motohiro Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 7a3af748f32b..a88cdba27809 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -6,8 +6,6 @@ enum { PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ PCG_USED, /* this object is in use. */ PCG_MIGRATION, /* under page migration */ - /* flags for mem_cgroup and file and I/O status */ - PCG_FILE_MAPPED, /* page is accounted as "mapped" */ __NR_PCG_FLAGS, }; @@ -66,10 +64,6 @@ TESTPCGFLAG(Used, USED) CLEARPCGFLAG(Used, USED) SETPCGFLAG(Used, USED) -SETPCGFLAG(FileMapped, FILE_MAPPED) -CLEARPCGFLAG(FileMapped, FILE_MAPPED) -TESTPCGFLAG(FileMapped, FILE_MAPPED) - SETPCGFLAG(Migration, MIGRATION) CLEARPCGFLAG(Migration, MIGRATION) TESTPCGFLAG(Migration, MIGRATION) -- cgit v1.2.3 From 4331f7d339ee0b54603344b9d13662a9c022540c Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 21 Mar 2012 16:34:26 -0700 Subject: memcg: fix performance of mem_cgroup_begin_update_page_stat() mem_cgroup_begin_update_page_stat() should be very fast because it's called very frequently. Now, it needs to look up page_cgroup and its memcg....this is slow. This patch adds a global variable to check "any memcg is moving or not". With this, the caller doesn't need to visit page_cgroup and memcg. Here is a test result. A test program makes page faults onto a file, MAP_SHARED and makes each page's page_mapcount(page) > 1, and free the range by madvise() and page fault again. This program causes 26214400 times of page fault onto a file(size was 1G.) and shows shows the cost of mem_cgroup_begin_update_page_stat(). Before this patch for mem_cgroup_begin_update_page_stat() [kamezawa@bluextal test]$ time ./mmap 1G real 0m21.765s user 0m5.999s sys 0m15.434s 27.46% mmap mmap [.] reader 21.15% mmap [kernel.kallsyms] [k] page_fault 9.17% mmap [kernel.kallsyms] [k] filemap_fault 2.96% mmap [kernel.kallsyms] [k] __do_fault 2.83% mmap [kernel.kallsyms] [k] __mem_cgroup_begin_update_page_stat After this patch [root@bluextal test]# time ./mmap 1G real 0m21.373s user 0m6.113s sys 0m15.016s In usual path, calls to __mem_cgroup_begin_update_page_stat() goes away. Note: we may be able to remove this optimization in future if we can get pointer to memcg directly from struct page. [akpm@linux-foundation.org: don't return a void] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Greg Thelen Acked-by: Johannes Weiner Cc: Michal Hocko Cc: KOSAKI Motohiro Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bf7ae01fc93b..f94efd2f6c27 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -144,6 +144,8 @@ static inline bool mem_cgroup_disabled(void) void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, unsigned long *flags); +extern atomic_t memcg_moving; + static inline void mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, unsigned long *flags) { @@ -151,7 +153,8 @@ static inline void mem_cgroup_begin_update_page_stat(struct page *page, return; rcu_read_lock(); *locked = false; - return __mem_cgroup_begin_update_page_stat(page, locked, flags); + if (atomic_read(&memcg_moving)) + __mem_cgroup_begin_update_page_stat(page, locked, flags); } void __mem_cgroup_end_update_page_stat(struct page *page, -- cgit v1.2.3 From d8c37c480678ebe09bc570f33e085e28049db035 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Mar 2012 16:34:27 -0700 Subject: thp: add HPAGE_PMD_* definitions for !CONFIG_TRANSPARENT_HUGEPAGE These macros will be used in a later patch, where all usages are expected to be optimized away without #ifdef CONFIG_TRANSPARENT_HUGEPAGE. But to detect unexpected usages, we convert the existing BUG() to BUILD_BUG(). [akpm@linux-foundation.org: fix build in mm/pgtable-generic.c] Signed-off-by: Naoya Horiguchi Acked-by: Hillf Danton Reviewed-by: Andrea Arcangeli Reviewed-by: KAMEZAWA Hiroyuki Acked-by: David Rientjes Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f56cacb4fec3..c8af7a2efb52 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -51,6 +51,9 @@ extern pmd_t *page_check_address_pmd(struct page *page, unsigned long address, enum page_check_address_pmd_flag flag); +#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) +#define HPAGE_PMD_NR (1< MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif @@ -158,9 +159,9 @@ static inline struct page *compound_trans_head(struct page *page) return page; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -#define HPAGE_PMD_SHIFT ({ BUG(); 0; }) -#define HPAGE_PMD_MASK ({ BUG(); 0; }) -#define HPAGE_PMD_SIZE ({ BUG(); 0; }) +#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) +#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) +#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) #define hpage_nr_pages(x) 1 -- cgit v1.2.3