Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 28
-rw-r--r-- | mm/Makefile | 7
-rw-r--r-- | mm/allocpercpu.c | 177
-rw-r--r-- | mm/backing-dev.c | 5
-rw-r--r-- | mm/bootmem.c | 228
-rw-r--r-- | mm/bounce.c | 1
-rw-r--r-- | mm/fadvise.c | 10
-rw-r--r-- | mm/failslab.c | 19
-rw-r--r-- | mm/filemap.c | 175
-rw-r--r-- | mm/filemap_xip.c | 3
-rw-r--r-- | mm/fremap.c | 2
-rw-r--r-- | mm/highmem.c | 2
-rw-r--r-- | mm/hugetlb.c | 558
-rw-r--r-- | mm/hwpoison-inject.c | 113
-rw-r--r-- | mm/internal.h | 35
-rw-r--r-- | mm/kmemleak.c | 193
-rw-r--r-- | mm/ksm.c | 961
-rw-r--r-- | mm/maccess.c | 11
-rw-r--r-- | mm/madvise.c | 21
-rw-r--r-- | mm/memcontrol.c | 1835
-rw-r--r-- | mm/memory-failure.c | 579
-rw-r--r-- | mm/memory.c | 216
-rw-r--r-- | mm/memory_hotplug.c | 26
-rw-r--r-- | mm/mempolicy.c | 234
-rw-r--r-- | mm/migrate.c | 177
-rw-r--r-- | mm/mincore.c | 39
-rw-r--r-- | mm/mlock.c | 57
-rw-r--r-- | mm/mmap.c | 311
-rw-r--r-- | mm/mmu_context.c | 4
-rw-r--r-- | mm/mmu_notifier.c | 1
-rw-r--r-- | mm/mprotect.c | 1
-rw-r--r-- | mm/mremap.c | 249
-rw-r--r-- | mm/nommu.c | 187
-rw-r--r-- | mm/oom_kill.c | 116
-rw-r--r-- | mm/page-writeback.c | 12
-rw-r--r-- | mm/page_alloc.c | 527
-rw-r--r-- | mm/page_cgroup.c | 42
-rw-r--r-- | mm/page_io.c | 18
-rw-r--r-- | mm/pagewalk.c | 59
-rw-r--r-- | mm/percpu.c | 90
-rw-r--r-- | mm/percpu_up.c | 30
-rw-r--r-- | mm/quicklist.c | 1
-rw-r--r-- | mm/readahead.c | 19
-rw-r--r-- | mm/rmap.c | 536
-rw-r--r-- | mm/shmem.c | 84
-rw-r--r-- | mm/shmem_acl.c | 171
-rw-r--r-- | mm/slab.c | 186
-rw-r--r-- | mm/slub.c | 368
-rw-r--r-- | mm/sparse-vmemmap.c | 77
-rw-r--r-- | mm/sparse.c | 197
-rw-r--r-- | mm/swap.c | 3
-rw-r--r-- | mm/swap_state.c | 1
-rw-r--r-- | mm/swapfile.c | 918
-rw-r--r-- | mm/truncate.c | 39
-rw-r--r-- | mm/util.c | 23
-rw-r--r-- | mm/vmalloc.c | 125
-rw-r--r-- | mm/vmscan.c | 464
-rw-r--r-- | mm/vmstat.c | 28 |
58 files changed, 7212 insertions, 3387 deletions
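Note: the index lines in the per-file headers below (e.g. 44cf6f0a3a6d..9c61158308dc for mm/Kconfig) are blob IDs for the old and new file contents, not commit IDs. As a minimal sketch, a path-limited summary like the diffstat above can be regenerated locally with git; OLD and NEW here are hypothetical placeholders for the two commits being compared, not values taken from this page:

    # --stat prints the per-file change counts; the pathspec limits the diff to mm/
    git diff --stat OLD..NEW -- mm/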
diff --git a/mm/Kconfig b/mm/Kconfig index 44cf6f0a3a6d..9c61158308dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME config SPARSEMEM_VMEMMAP_ENABLE bool +config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + def_bool y + depends on SPARSEMEM && X86_64 + config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE @@ -158,11 +162,13 @@ config PAGEFLAGS_EXTENDED # Default to 4 for wider testing, though 8 might be more appropriate. # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. +# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. # config SPLIT_PTLOCK_CPUS int - default "4096" if ARM && !CPU_CACHE_VIPT - default "4096" if PARISC && !PA20 + default "999999" if ARM && !CPU_CACHE_VIPT + default "999999" if PARISC && !PA20 + default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" # @@ -193,21 +199,13 @@ config BOUNCE config NR_QUICK int depends on QUICKLIST - default "2" if SUPERH || AVR32 + default "2" if AVR32 default "1" config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config HAVE_MLOCK - bool - default y if MMU=y - -config HAVE_MLOCKED_PAGE_BIT - bool - default y if HAVE_MLOCK=y - config MMU_NOTIFIER bool @@ -218,7 +216,7 @@ config KSM Enable Kernel Samepage Merging: KSM periodically scans those areas of an application's address space that an app has advised may be mergeable. When it finds pages of identical content, it replaces - the many instances by a single resident page with that content, so + the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. See Documentation/vm/ksm.txt for more information: KSM is inactive @@ -227,6 +225,7 @@ config KSM config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" + depends on MMU default 4096 help This is the portion of low virtual memory which should be protected @@ -257,8 +256,9 @@ config MEMORY_FAILURE special hardware support and typically ECC memory. 
config HWPOISON_INJECT - tristate "Poison pages injector" - depends on MEMORY_FAILURE && DEBUG_KERNEL + tristate "HWPoison pages injector" + depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS + select PROC_PAGE_MONITOR config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" diff --git a/mm/Makefile b/mm/Makefile index ebf849042ed3..6c2a73a54a43 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o -obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o @@ -34,10 +33,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA -obj-$(CONFIG_SMP) += percpu.o +ifdef CONFIG_SMP +obj-y += percpu.o else -obj-$(CONFIG_SMP) += allocpercpu.o +obj-y += percpu_up.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c deleted file mode 100644 index df34ceae0c67..000000000000 --- a/mm/allocpercpu.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * linux/mm/allocpercpu.c - * - * Separated from slab.c August 11, 2006 Christoph Lameter - */ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/bootmem.h> -#include <asm/sections.h> - -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif - -/** - * percpu_depopulate - depopulate per-cpu data for given cpu - * @__pdata: per-cpu data to depopulate - * @cpu: depopulate per-cpu data for this cpu - * - * Depopulating per-cpu data for a cpu going offline would be a typical - * use case. You need to register a cpu hotplug handler for that purpose. - */ -static void percpu_depopulate(void *__pdata, int cpu) -{ - struct percpu_data *pdata = __percpu_disguise(__pdata); - - kfree(pdata->ptrs[cpu]); - pdata->ptrs[cpu] = NULL; -} - -/** - * percpu_depopulate_mask - depopulate per-cpu data for some cpu's - * @__pdata: per-cpu data to depopulate - * @mask: depopulate per-cpu data for cpu's selected through mask bits - */ -static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask) -{ - int cpu; - for_each_cpu_mask_nr(cpu, *mask) - percpu_depopulate(__pdata, cpu); -} - -#define percpu_depopulate_mask(__pdata, mask) \ - __percpu_depopulate_mask((__pdata), &(mask)) - -/** - * percpu_populate - populate per-cpu data for given cpu - * @__pdata: per-cpu data to populate further - * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @cpu: populate per-data for this cpu - * - * Populating per-cpu data for a cpu coming online would be a typical - * use case. You need to register a cpu hotplug handler for that purpose. - * Per-cpu object is populated with zeroed buffer. - */ -static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) -{ - struct percpu_data *pdata = __percpu_disguise(__pdata); - int node = cpu_to_node(cpu); - - /* - * We should make sure each CPU gets private memory. 
- */ - size = roundup(size, cache_line_size()); - - BUG_ON(pdata->ptrs[cpu]); - if (node_online(node)) - pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node); - else - pdata->ptrs[cpu] = kzalloc(size, gfp); - return pdata->ptrs[cpu]; -} - -/** - * percpu_populate_mask - populate per-cpu data for more cpu's - * @__pdata: per-cpu data to populate further - * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @mask: populate per-cpu data for cpu's selected through mask bits - * - * Per-cpu objects are populated with zeroed buffers. - */ -static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, - cpumask_t *mask) -{ - cpumask_t populated; - int cpu; - - cpus_clear(populated); - for_each_cpu_mask_nr(cpu, *mask) - if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { - __percpu_depopulate_mask(__pdata, &populated); - return -ENOMEM; - } else - cpu_set(cpu, populated); - return 0; -} - -#define percpu_populate_mask(__pdata, size, gfp, mask) \ - __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) - -/** - * alloc_percpu - initial setup of per-cpu data - * @size: size of per-cpu object - * @align: alignment - * - * Allocate dynamic percpu area. Percpu objects are populated with - * zeroed buffers. - */ -void *__alloc_percpu(size_t size, size_t align) -{ - /* - * We allocate whole cache lines to avoid false sharing - */ - size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, GFP_KERNEL); - void *__pdata = __percpu_disguise(pdata); - - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. - */ - WARN_ON_ONCE(align > SMP_CACHE_BYTES); - - if (unlikely(!pdata)) - return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, - &cpu_possible_map))) - return __pdata; - kfree(pdata); - return NULL; -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -/** - * free_percpu - final cleanup of per-cpu data - * @__pdata: object to clean up - * - * We simply clean up any per-cpu object left. No need for the client to - * track and specify through a bis mask which per-cpu objects are to free. - */ -void free_percpu(void *__pdata) -{ - if (unlikely(!__pdata)) - return; - __percpu_depopulate_mask(__pdata, cpu_possible_mask); - kfree(__percpu_disguise(__pdata)); -} -EXPORT_SYMBOL_GPL(free_percpu); - -/* - * Generic percpu area setup. 
- */ -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 67a33a5a1a93..f13e067e1467 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -227,6 +227,9 @@ static struct device_attribute bdi_dev_attrs[] = { static __init int bdi_class_init(void) { bdi_class = class_create(THIS_MODULE, "bdi"); + if (IS_ERR(bdi_class)) + return PTR_ERR(bdi_class); + bdi_class->dev_attrs = bdi_dev_attrs; bdi_debug_init(); return 0; @@ -609,7 +612,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) * it would never exet if it is currently stuck in the refrigerator. */ list_for_each_entry(wb, &bdi->wb_list, list) { - wb->task->flags &= ~PF_FROZEN; + thaw_process(wb->task); kthread_stop(wb->task); } } diff --git a/mm/bootmem.c b/mm/bootmem.c index 555d5d2731c6..58c66cc5056a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -10,9 +10,11 @@ */ #include <linux/init.h> #include <linux/pfn.h> +#include <linux/slab.h> #include <linux/bootmem.h> #include <linux/module.h> #include <linux/kmemleak.h> +#include <linux/range.h> #include <asm/bug.h> #include <asm/io.h> @@ -32,6 +34,7 @@ unsigned long max_pfn; unsigned long saved_max_pfn; #endif +#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -142,7 +145,78 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } +#endif +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. 
+ */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long cursor, end; + kmemleak_free_part(__va(addr), size); + + cursor = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + +#ifdef CONFIG_NO_BOOTMEM +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} +#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -203,6 +277,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } +#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -213,7 +288,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); +#ifdef CONFIG_NO_BOOTMEM + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +#else return free_all_bootmem_core(pgdat->bdata); +#endif } /** @@ -223,9 +303,27 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { - return free_all_bootmem_core(NODE_DATA(0)->bdata); +#ifdef CONFIG_NO_BOOTMEM + /* + * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesnt have RAM installed + * low ram will be on Node1 + * Use MAX_NUMNODES will make sure all ranges in early_node_map[] + * will be used instead of only Node0 related + */ + return free_all_memory_core_early(MAX_NUMNODES); +#else + unsigned long total_pages = 0; + bootmem_data_t *bdata; + + list_for_each_entry(bdata, &bdata_list, list) + total_pages += free_all_bootmem_core(bdata); + + return total_pages; +#endif } +#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -320,6 +418,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } +#endif /** * free_bootmem_node - mark a page range as usable @@ -334,6 +433,9 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + free_early(physaddr, physaddr + size); +#else unsigned long start, end; kmemleak_free_part(__va(physaddr), size); @@ -342,6 +444,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, end = 
PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); +#endif } /** @@ -355,6 +458,9 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + free_early(addr, addr + size); +#else unsigned long start, end; kmemleak_free_part(__va(addr), size); @@ -363,6 +469,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); +#endif } /** @@ -379,12 +486,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); +#endif } /** @@ -400,16 +512,22 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); +#endif } -static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, - unsigned long step) +#ifndef CONFIG_NO_BOOTMEM +static unsigned long __init align_idx(struct bootmem_data *bdata, + unsigned long idx, unsigned long step) { unsigned long base = bdata->node_min_pfn; @@ -421,8 +539,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, return ALIGN(base + idx, step) - base; } -static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, - unsigned long align) +static unsigned long __init align_off(struct bootmem_data *bdata, + unsigned long off, unsigned long align) { unsigned long base = PFN_PHYS(bdata->node_min_pfn); @@ -558,12 +676,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } +#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { +#ifdef CONFIG_NO_BOOTMEM + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +#else bootmem_data_t *bdata; void *region; @@ -589,6 +728,7 @@ restart: } return NULL; +#endif } /** @@ -607,7 +747,13 @@ restart: void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem_nopanic(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem_nopanic(size, align, goal, limit); } static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, @@ -641,9 +787,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem(size, align, goal, limit); } +#ifndef CONFIG_NO_BOOTMEM static void * __init 
___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -660,6 +813,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } +#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -682,7 +836,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + return __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); +#else return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); +#endif +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + new_goal, -1ULL); +#else + ptr = alloc_bootmem_core(pgdat->bdata, size, align, + new_goal, 0); +#endif + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + } #ifdef CONFIG_SPARSEMEM @@ -696,6 +889,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { +#ifdef CONFIG_NO_BOOTMEM + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -705,6 +908,7 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); +#endif } #endif @@ -716,11 +920,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); +#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); +#endif if (ptr) return ptr; @@ -771,6 +980,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + return __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +#else return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +#endif } diff --git a/mm/bounce.c b/mm/bounce.c index a2b76a588e34..13b6dad1eed2 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/swap.h> +#include <linux/gfp.h> #include <linux/bio.h> #include <linux/pagemap.h> #include 
<linux/mempool.h> diff --git a/mm/fadvise.c b/mm/fadvise.c index e43359214f6f..8d723c9e8b75 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: - file->f_ra.ra_pages = 0; + spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_SEQUENTIAL: file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: if (!mapping->a_ops->readpage) { diff --git a/mm/failslab.c b/mm/failslab.c index 9339de5f0a91..c5f88f240ddc 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,18 +1,21 @@ #include <linux/fault-inject.h> -#include <linux/gfp.h> +#include <linux/slab.h> static struct { struct fault_attr attr; u32 ignore_gfp_wait; + int cache_filter; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS struct dentry *ignore_gfp_wait_file; + struct dentry *cache_filter_file; #endif } failslab = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, + .cache_filter = 0, }; -bool should_failslab(size_t size, gfp_t gfpflags) +bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) { if (gfpflags & __GFP_NOFAIL) return false; @@ -20,6 +23,9 @@ bool should_failslab(size_t size, gfp_t gfpflags) if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) return false; + if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) + return false; + return should_fail(&failslab.attr, size); } @@ -30,7 +36,6 @@ static int __init setup_failslab(char *str) __setup("failslab=", setup_failslab); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - static int __init failslab_debugfs_init(void) { mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; @@ -46,8 +51,14 @@ static int __init failslab_debugfs_init(void) debugfs_create_bool("ignore-gfp-wait", mode, dir, &failslab.ignore_gfp_wait); - if (!failslab.ignore_gfp_wait_file) { + failslab.cache_filter_file = + debugfs_create_bool("cache-filter", mode, dir, + &failslab.cache_filter); + + if (!failslab.ignore_gfp_wait_file || + !failslab.cache_filter_file) { err = -ENOMEM; + debugfs_remove(failslab.cache_filter_file); debugfs_remove(failslab.ignore_gfp_wait_file); cleanup_fault_attr_dentries(&failslab.attr); } diff --git a/mm/filemap.c b/mm/filemap.c index ef169f37156d..140ebda9640f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -10,13 +10,13 @@ * the NFS filesystem used to do this differently, for example) */ #include <linux/module.h> -#include <linux/slab.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/aio.h> #include <linux/capability.h> #include <linux/kernel_stat.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/mman.h> @@ -260,27 +260,27 @@ int filemap_flush(struct address_space *mapping) EXPORT_SYMBOL(filemap_flush); /** - * wait_on_page_writeback_range - wait for writeback to complete - * @mapping: target address_space - * @start: beginning page index - * @end: ending page index + * filemap_fdatawait_range - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) * - * Wait for writeback to complete against pages 
indexed by start->end - * inclusive + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. */ -int wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end) +int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, + loff_t end_byte) { + pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; + pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; int ret = 0; - pgoff_t index; - if (end < start) + if (end_byte < start_byte) return 0; pagevec_init(&pvec, 0); - index = start; while ((index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_WRITEBACK, @@ -310,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping, return ret; } - -/** - * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range - * @mapping: address space structure to wait for - * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends (inclusive) - * - * Walk the list of under-writeback pages of the given address space - * in the given range and wait for all of them. - * - * This is just a simple wrapper so that callers don't have to convert offsets - * to page indexes themselves - */ -int filemap_fdatawait_range(struct address_space *mapping, loff_t start, - loff_t end) -{ - return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT, - end >> PAGE_CACHE_SHIFT); -} EXPORT_SYMBOL(filemap_fdatawait_range); /** @@ -345,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping) if (i_size == 0) return 0; - return wait_on_page_writeback_range(mapping, 0, - (i_size - 1) >> PAGE_CACHE_SHIFT); + return filemap_fdatawait_range(mapping, 0, i_size - 1); } EXPORT_SYMBOL(filemap_fdatawait); @@ -393,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ if (err != -EIO) { - int err2 = wait_on_page_writeback_range(mapping, - lstart >> PAGE_CACHE_SHIFT, - lend >> PAGE_CACHE_SHIFT); + int err2 = filemap_fdatawait_range(mapping, + lstart, lend); if (!err) err = err2; } @@ -1138,7 +1117,7 @@ readpage: if (!PageUptodate(page)) { if (page->mapping == NULL) { /* - * invalidate_inode_pages got it + * invalidate_mapping_pages got it */ unlock_page(page); page_cache_release(page); @@ -1655,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) { struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); @@ -1682,31 +1662,18 @@ repeat: return page; } -/** - * read_cache_page_async - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: destination for read data - * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. 
- */ -struct page *read_cache_page_async(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) + { struct page *page; int err; retry: - page = __read_cache_page(mapping, index, filler, data); + page = __read_cache_page(mapping, index, filler, data, gfp); if (IS_ERR(page)) return page; if (PageUptodate(page)) @@ -1731,8 +1698,67 @@ out: mark_page_accessed(page); return page; } + +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Same as read_cache_page, but don't wait for page to become unlocked + * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_async(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *,struct page*), + void *data) +{ + return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); +} EXPORT_SYMBOL(read_cache_page_async); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + +/** + * read_cache_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This is the same as "read_mapping_page(mapping, index, NULL)", but with + * any new page allocations done using the specified allocation flags. Note + * that the Radix tree operations will still use GFP_KERNEL, so you can't + * expect to do this atomically or anything like that - but you can pass in + * other page requirements. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, + gfp_t gfp) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + + return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); +} +EXPORT_SYMBOL(read_cache_page_gfp); + /** * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space @@ -1750,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping, int (*filler)(void *,struct page*), void *data) { - struct page *page; - - page = read_cache_page_async(mapping, index, filler, data); - if (IS_ERR(page)) - goto out; - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - out: - return page; + return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); } EXPORT_SYMBOL(read_cache_page); @@ -1844,7 +1859,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr, /* * Copy as much as we can into the page and return the number of bytes which - * were sucessfully copied. If a fault is encountered then return the number of + * were successfully copied. If a fault is encountered then return the number of * bytes which were copied. 
*/ size_t iov_iter_copy_from_user_atomic(struct page *page, @@ -1971,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count); inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) { struct inode *inode = file->f_mapping->host; - unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + unsigned long limit = rlimit(RLIMIT_FSIZE); if (unlikely(*pos < 0)) return -EINVAL; @@ -2217,6 +2232,9 @@ again: if (unlikely(status)) break; + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); pagefault_enable(); @@ -2261,7 +2279,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, size_t count, ssize_t written) { struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; ssize_t status; struct iov_iter i; @@ -2273,15 +2290,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, *ppos = pos + status; } - /* - * If we get here for O_DIRECT writes then we must have fallen through - * to buffered writes (block instantiation inside i_size). So we sync - * the file data here, to try to honour O_DIRECT expectations. - */ - if (unlikely(file->f_flags & O_DIRECT) && written) - status = filemap_write_and_wait_range(mapping, - pos, pos + written - 1); - return written ? written : status; } EXPORT_SYMBOL(generic_file_buffered_write); @@ -2380,10 +2388,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, * semantics. */ endbyte = pos + written_buffered - written - 1; - err = do_sync_mapping_range(file->f_mapping, pos, endbyte, - SYNC_FILE_RANGE_WAIT_BEFORE| - SYNC_FILE_RANGE_WRITE| - SYNC_FILE_RANGE_WAIT_AFTER); + err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); if (err == 0) { written = written_buffered; invalidate_mapping_pages(mapping, diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 1888b2d71bb8..83364df74a33 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/seqlock.h> #include <linux/mutex.h> +#include <linux/gfp.h> #include <asm/tlbflush.h> #include <asm/io.h> @@ -194,7 +195,7 @@ retry: flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); page_cache_release(page); diff --git a/mm/fremap.c b/mm/fremap.c index b6ec85abbb39..46f5dacf90a2 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, page_remove_rmap(page); page_cache_release(page); update_hiwater_rss(mm); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); } } else { if (!pte_file(pte)) diff --git a/mm/highmem.c b/mm/highmem.c index 9c1e627f282e..bed8a8bfd01f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -220,7 +220,7 @@ EXPORT_SYMBOL(kmap_high); * @page: &struct page to pin * * Returns the page's current virtual memory address, or NULL if no mapping - * exists. When and only when a non null address is returned then a + * exists. If and only if a non null address is returned then a * matching call to kunmap_high() is necessary. * * This can be called from any context. diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d7601b02874..6034dc9e9796 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2,7 +2,6 @@ * Generic hugetlb support. 
* (C) William Irwin, April 2004 */ -#include <linux/gfp.h> #include <linux/list.h> #include <linux/init.h> #include <linux/module.h> @@ -18,12 +17,14 @@ #include <linux/mutex.h> #include <linux/bootmem.h> #include <linux/sysfs.h> +#include <linux/slab.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/io.h> #include <linux/hugetlb.h> +#include <linux/node.h> #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; @@ -401,7 +402,7 @@ static void clear_huge_page(struct page *page, { int i; - if (unlikely(sz > MAX_ORDER_NR_PAGES)) { + if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, sz); return; } @@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) } /* - * Use a helper variable to find the next node and then - * copy it back to next_nid_to_alloc afterwards: - * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. - * But we don't need to use a spin_lock here: it really - * doesn't matter if occasionally a racer chooses the - * same nid as we do. Move nid forward in the mask even - * if we just successfully allocated a hugepage so that - * the next caller gets hugepages on the next node. + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. */ -static int hstate_next_node_to_alloc(struct hstate *h) +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - int next_nid; - next_nid = next_node(h->next_nid_to_alloc, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - h->next_nid_to_alloc = next_nid; - return next_nid; + nid = next_node(nid, *nodes_allowed); + if (nid == MAX_NUMNODES) + nid = first_node(*nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. 
+ */ +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + + return nid; } -static int alloc_fresh_huge_page(struct hstate *h) +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = h->next_nid_to_alloc; + start_nid = hstate_next_node_to_alloc(h, nodes_allowed); next_nid = start_nid; do { page = alloc_fresh_huge_page_node(h, next_nid); - if (page) + if (page) { ret = 1; - next_nid = hstate_next_node_to_alloc(h); - } while (!page && next_nid != start_nid); + break; + } + next_nid = hstate_next_node_to_alloc(h, nodes_allowed); + } while (next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h) } /* - * helper for free_pool_huge_page() - find next node - * from which to free a huge page + * helper for free_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. */ -static int hstate_next_node_to_free(struct hstate *h) +static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) { - int next_nid; - next_nid = next_node(h->next_nid_to_free, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - h->next_nid_to_free = next_nid; - return next_nid; + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); + + return nid; } /* @@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h) * balanced over allowed nodes. * Called with hugetlb_lock locked. */ -static int free_pool_huge_page(struct hstate *h, bool acct_surplus) +static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + bool acct_surplus) { int start_nid; int next_nid; int ret = 0; - start_nid = h->next_nid_to_free; + start_nid = hstate_next_node_to_free(h, nodes_allowed); next_nid = start_nid; do { @@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus) } update_and_free_page(h, page); ret = 1; + break; } - next_nid = hstate_next_node_to_free(h); - } while (!ret && next_nid != start_nid); + next_nid = hstate_next_node_to_free(h, nodes_allowed); + } while (next_nid != start_nid); return ret; } @@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h, /* * We want to release as many surplus pages as possible, spread - * evenly across all nodes. Iterate across all nodes until we - * can no longer free unreserved surplus pages. This occurs when - * the nodes with surplus pages have no free pages. - * free_pool_huge_page() will balance the the frees across the - * on-line nodes for us and will handle the hstate accounting. + * evenly across all nodes with memory. Iterate across these nodes + * until we can no longer free unreserved surplus pages. This occurs + * when the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the freed pages across the + * on-line nodes with memory and will handle the hstate accounting. 
*/ while (nr_pages--) { - if (!free_pool_huge_page(h, 1)) + if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) break; } } @@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; - int nr_nodes = nodes_weight(node_online_map); + int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); while (nr_nodes) { void *addr; addr = __alloc_bootmem_node_nopanic( - NODE_DATA(h->next_nid_to_alloc), + NODE_DATA(hstate_next_node_to_alloc(h, + &node_states[N_HIGH_MEMORY])), huge_page_size(h), huge_page_size(h), 0); - hstate_next_node_to_alloc(h); if (addr) { /* * Use the beginning of the huge page to store the @@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (h->order >= MAX_ORDER) { if (!alloc_bootmem_huge_page(h)) break; - } else if (!alloc_fresh_huge_page(h)) + } else if (!alloc_fresh_huge_page(h, + &node_states[N_HIGH_MEMORY])) break; } h->max_huge_pages = i; @@ -1126,14 +1158,15 @@ static void __init report_hugepages(void) } #ifdef CONFIG_HIGHMEM -static void try_to_free_low(struct hstate *h, unsigned long count) +static void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { int i; if (h->order >= MAX_ORDER) return; - for (i = 0; i < MAX_NUMNODES; ++i) { + for_each_node_mask(i, *nodes_allowed) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { @@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count) } } #else -static inline void try_to_free_low(struct hstate *h, unsigned long count) +static inline void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { } #endif @@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. 
*/ -static int adjust_pool_surplus(struct hstate *h, int delta) +static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, + int delta) { int start_nid, next_nid; int ret = 0; @@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta) VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) - start_nid = h->next_nid_to_alloc; + start_nid = hstate_next_node_to_alloc(h, nodes_allowed); else - start_nid = h->next_nid_to_free; + start_nid = hstate_next_node_to_free(h, nodes_allowed); next_nid = start_nid; do { int nid = next_nid; if (delta < 0) { - next_nid = hstate_next_node_to_alloc(h); /* * To shrink on this node, there must be a surplus page */ - if (!h->surplus_huge_pages_node[nid]) + if (!h->surplus_huge_pages_node[nid]) { + next_nid = hstate_next_node_to_alloc(h, + nodes_allowed); continue; + } } if (delta > 0) { - next_nid = hstate_next_node_to_free(h); /* * Surplus cannot exceed the total number of pages */ if (h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) + h->nr_huge_pages_node[nid]) { + next_nid = hstate_next_node_to_free(h, + nodes_allowed); continue; + } } h->surplus_huge_pages += delta; @@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) +static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { unsigned long min_count, ret; @@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) */ spin_lock(&hugetlb_lock); while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { - if (!adjust_pool_surplus(h, -1)) + if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; } @@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(h); + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; + /* Bail for signals. 
Probably ctrl-c from user */ + if (signal_pending(current)) + goto out; } /* @@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) */ min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); - try_to_free_low(h, min_count); + try_to_free_low(h, min_count, nodes_allowed); while (min_count < persistent_huge_pages(h)) { - if (!free_pool_huge_page(h, 0)) + if (!free_pool_huge_page(h, nodes_allowed, 0)) break; } while (count < persistent_huge_pages(h)) { - if (!adjust_pool_surplus(h, 1)) + if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: @@ -1282,43 +1325,117 @@ out: static struct kobject *hugepages_kobj; static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; -static struct hstate *kobj_to_hstate(struct kobject *kobj) +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); + +static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) { int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) - if (hstate_kobjs[i] == kobj) + if (hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = NUMA_NO_NODE; return &hstates[i]; - BUG(); - return NULL; + } + + return kobj_to_node_hstate(kobj, nidp); } -static ssize_t nr_hugepages_show(struct kobject *kobj, +static ssize_t nr_hugepages_show_common(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); - return sprintf(buf, "%lu\n", h->nr_huge_pages); + struct hstate *h; + unsigned long nr_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + nr_huge_pages = h->nr_huge_pages; + else + nr_huge_pages = h->nr_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", nr_huge_pages); } -static ssize_t nr_hugepages_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) { int err; - unsigned long input; - struct hstate *h = kobj_to_hstate(kobj); + int nid; + unsigned long count; + struct hstate *h; + NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = strict_strtoul(buf, 10, &input); + err = strict_strtoul(buf, 10, &count); if (err) return 0; - h->max_huge_pages = set_max_huge_pages(h, input); + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) { + /* + * global hstate attribute + */ + if (!(obey_mempolicy && + init_nodemask_of_mempolicy(nodes_allowed))) { + NODEMASK_FREE(nodes_allowed); + nodes_allowed = &node_states[N_HIGH_MEMORY]; + } + } else if (nodes_allowed) { + /* + * per node hstate attribute: adjust count to global, + * but restrict alloc/free to the specified node. 
+ */ + count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; + init_nodemask_of_node(nodes_allowed, nid); + } else + nodes_allowed = &node_states[N_HIGH_MEMORY]; + + h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); - return count; + if (nodes_allowed != &node_states[N_HIGH_MEMORY]) + NODEMASK_FREE(nodes_allowed); + + return len; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(false, kobj, attr, buf, len); } HSTATE_ATTR(nr_hugepages); +#ifdef CONFIG_NUMA + +/* + * hstate attribute for optionally mempolicy-based constraint on persistent + * huge page alloc/free. + */ +static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(true, kobj, attr, buf, len); +} +HSTATE_ATTR(nr_hugepages_mempolicy); +#endif + + static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); + struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); } static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, @@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, { int err; unsigned long input; - struct hstate *h = kobj_to_hstate(kobj); + struct hstate *h = kobj_to_hstate(kobj, NULL); err = strict_strtoul(buf, 10, &input); if (err) @@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages); static ssize_t free_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); - return sprintf(buf, "%lu\n", h->free_huge_pages); + struct hstate *h; + unsigned long free_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + free_huge_pages = h->free_huge_pages; + else + free_huge_pages = h->free_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", free_huge_pages); } HSTATE_ATTR_RO(free_hugepages); static ssize_t resv_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); + struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->resv_huge_pages); } HSTATE_ATTR_RO(resv_hugepages); @@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages); static ssize_t surplus_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); - return sprintf(buf, "%lu\n", h->surplus_huge_pages); + struct hstate *h; + unsigned long surplus_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + surplus_huge_pages = h->surplus_huge_pages; + else + surplus_huge_pages = h->surplus_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", surplus_huge_pages); } HSTATE_ATTR_RO(surplus_hugepages); @@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = { &free_hugepages_attr.attr, &resv_hugepages_attr.attr, &surplus_hugepages_attr.attr, +#ifdef CONFIG_NUMA + &nr_hugepages_mempolicy_attr.attr, +#endif NULL, }; @@ -1377,19 +1515,20 @@ static struct 
attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; -static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, + struct kobject **hstate_kobjs, + struct attribute_group *hstate_attr_group) { int retval; + int hi = h - hstates; - hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, - hugepages_kobj); - if (!hstate_kobjs[h - hstates]) + hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); + if (!hstate_kobjs[hi]) return -ENOMEM; - retval = sysfs_create_group(hstate_kobjs[h - hstates], - &hstate_attr_group); + retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); if (retval) - kobject_put(hstate_kobjs[h - hstates]); + kobject_put(hstate_kobjs[hi]); return retval; } @@ -1404,17 +1543,184 @@ static void __init hugetlb_sysfs_init(void) return; for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h); + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); if (err) printk(KERN_ERR "Hugetlb: Unable to add hstate %s", h->name); } } +#ifdef CONFIG_NUMA + +/* + * node_hstate/s - associate per node hstate attributes, via their kobjects, + * with node sysdevs in node_devices[] using a parallel array. The array + * index of a node sysdev or _hstate == node id. + * This is here to avoid any static dependency of the node sysdev driver, in + * the base kernel, on the hugetlb module. + */ +struct node_hstate { + struct kobject *hugepages_kobj; + struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; +}; +struct node_hstate node_hstates[MAX_NUMNODES]; + +/* + * A subset of global hstate attributes for node sysdevs + */ +static struct attribute *per_node_hstate_attrs[] = { + &nr_hugepages_attr.attr, + &free_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group per_node_hstate_attr_group = { + .attrs = per_node_hstate_attrs, +}; + +/* + * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. + * Returns node id via non-NULL nidp. + */ +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) { + struct node_hstate *nhs = &node_hstates[nid]; + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (nhs->hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = nid; + return &hstates[i]; + } + } + + BUG(); + return NULL; +} + +/* + * Unregister hstate attributes from a single node sysdev. + * No-op if no hstate attributes attached. + */ +void hugetlb_unregister_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + + if (!nhs->hugepages_kobj) + return; /* no hstate attributes */ + + for_each_hstate(h) + if (nhs->hstate_kobjs[h - hstates]) { + kobject_put(nhs->hstate_kobjs[h - hstates]); + nhs->hstate_kobjs[h - hstates] = NULL; + } + + kobject_put(nhs->hugepages_kobj); + nhs->hugepages_kobj = NULL; +} + +/* + * hugetlb module exit: unregister hstate attributes from node sysdevs + * that have them. + */ +static void hugetlb_unregister_all_nodes(void) +{ + int nid; + + /* + * disable node sysdev registrations. + */ + register_hugetlbfs_with_node(NULL, NULL); + + /* + * remove hstate attributes from any nodes that have them. + */ + for (nid = 0; nid < nr_node_ids; nid++) + hugetlb_unregister_node(&node_devices[nid]); +} + +/* + * Register hstate attributes for a single node sysdev. + * No-op if attributes already registered. 
+ */ +void hugetlb_register_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + int err; + + if (nhs->hugepages_kobj) + return; /* already allocated */ + + nhs->hugepages_kobj = kobject_create_and_add("hugepages", + &node->sysdev.kobj); + if (!nhs->hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, + nhs->hstate_kobjs, + &per_node_hstate_attr_group); + if (err) { + printk(KERN_ERR "Hugetlb: Unable to add hstate %s" + " for node %d\n", + h->name, node->sysdev.id); + hugetlb_unregister_node(node); + break; + } + } +} + +/* + * hugetlb init time: register hstate attributes for all registered node + * sysdevs of nodes that have memory. All on-line nodes should have + * registered their associated sysdev by this time. + */ +static void hugetlb_register_all_nodes(void) +{ + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) { + struct node *node = &node_devices[nid]; + if (node->sysdev.id == nid) + hugetlb_register_node(node); + } + + /* + * Let the node sysdev driver know we're here so it can + * [un]register hstate attributes on node hotplug. + */ + register_hugetlbfs_with_node(hugetlb_register_node, + hugetlb_unregister_node); +} +#else /* !CONFIG_NUMA */ + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + BUG(); + if (nidp) + *nidp = -1; + return NULL; +} + +static void hugetlb_unregister_all_nodes(void) { } + +static void hugetlb_register_all_nodes(void) { } + +#endif + static void __exit hugetlb_exit(void) { struct hstate *h; + hugetlb_unregister_all_nodes(); + for_each_hstate(h) { kobject_put(hstate_kobjs[h - hstates]); } @@ -1449,6 +1755,8 @@ static int __init hugetlb_init(void) hugetlb_sysfs_init(); + hugetlb_register_all_nodes(); + return 0; } module_init(hugetlb_init); @@ -1472,8 +1780,8 @@ void __init hugetlb_add_hstate(unsigned order) h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); - h->next_nid_to_alloc = first_node(node_online_map); - h->next_nid_to_free = first_node(node_online_map); + h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); + h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -1536,9 +1844,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array) } #ifdef CONFIG_SYSCTL -int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) +static int hugetlb_sysctl_handler_common(bool obey_mempolicy, + struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; @@ -1550,12 +1858,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->maxlen = sizeof(unsigned long); proc_doulongvec_minmax(table, write, buffer, length, ppos); - if (write) - h->max_huge_pages = set_max_huge_pages(h, tmp); + if (write) { + NODEMASK_ALLOC(nodemask_t, nodes_allowed, + GFP_KERNEL | __GFP_NORETRY); + if (!(obey_mempolicy && + init_nodemask_of_mempolicy(nodes_allowed))) { + NODEMASK_FREE(nodes_allowed); + nodes_allowed = &node_states[N_HIGH_MEMORY]; + } + h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); + + if (nodes_allowed != &node_states[N_HIGH_MEMORY]) + NODEMASK_FREE(nodes_allowed); + } return 0; } +int hugetlb_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + + 
return hugetlb_sysctl_handler_common(false, table, write, + buffer, length, ppos); +} + +#ifdef CONFIG_NUMA +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + return hugetlb_sysctl_handler_common(true, table, write, + buffer, length, ppos); +} +#endif /* CONFIG_NUMA */ + int hugetlb_treat_movable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -1751,7 +2087,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, ptep); } } @@ -1903,6 +2239,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + (vma->vm_pgoff >> PAGE_SHIFT); mapping = (struct address_space *)page_private(page); + /* + * Take the mapping lock for the duration of the table walk. As + * this mapping should be shared between all the VMAs, + * __unmap_hugepage_range() is called as the lock is already held + */ + spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) @@ -1916,10 +2258,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from the time of fork. This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) - unmap_hugepage_range(iter_vma, + __unmap_hugepage_range(iter_vma, address, address + huge_page_size(h), page); } + spin_unlock(&mapping->i_mmap_lock); return 1; } @@ -1959,6 +2302,9 @@ retry_avoidcopy: outside_reserve = 1; page_cache_get(old_page); + + /* Drop page_table_lock as buddy allocator may be called */ + spin_unlock(&mm->page_table_lock); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { @@ -1976,19 +2322,25 @@ retry_avoidcopy: if (unmap_ref_private(mm, vma, old_page, address)) { BUG_ON(page_count(old_page) != 1); BUG_ON(huge_pte_none(pte)); + spin_lock(&mm->page_table_lock); goto retry_avoidcopy; } WARN_ON_ONCE(1); } + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return -PTR_ERR(new_page); } - spin_unlock(&mm->page_table_lock); copy_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); - spin_lock(&mm->page_table_lock); + /* + * Retake the page_table_lock to check for racing updates + * before the page tables are altered + */ + spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ @@ -2206,7 +2558,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = pte_mkyoung(entry); if (huge_ptep_set_access_flags(vma, address, ptep, entry, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, ptep); out_page_table_lock: spin_unlock(&mm->page_table_lock); diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e1d85137f086..10ea71905c1f 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -3,18 +3,68 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include "internal.h" -static struct dentry *hwpoison_dir, *corrupt_pfn; +static struct dentry *hwpoison_dir; static int hwpoison_inject(void *data, u64 val) { + unsigned long pfn = val; + 
struct page *p; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!hwpoison_filter_enable) + goto inject; + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + /* + * This implies unable to support free buddy pages. + */ + if (!get_page_unless_zero(p)) + return 0; + + if (!PageLRU(p)) + shake_page(p, 0); + /* + * This implies unable to support non-LRU pages. + */ + if (!PageLRU(p)) + return 0; + + /* + * do a racy check with elevated page count, to make sure PG_hwpoison + * will only be set for the targeted owner (or on a free page). + * We temporarily take page lock for try_get_mem_cgroup_from_page(). + * __memory_failure() will redo the check reliably inside page lock. + */ + lock_page(p); + err = hwpoison_filter(p); + unlock_page(p); + if (err) + return 0; + +inject: + printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); + return __memory_failure(pfn, 18, MF_COUNT_INCREASED); +} + +static int hwpoison_unpoison(void *data, u64 val) +{ if (!capable(CAP_SYS_ADMIN)) return -EPERM; - printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); - return __memory_failure(val, 18, 0); + + return unpoison_memory(val); } DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); +DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { @@ -24,16 +74,63 @@ static void pfn_inject_exit(void) static int pfn_inject_init(void) { + struct dentry *dentry; + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); if (hwpoison_dir == NULL) return -ENOMEM; - corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + + /* + * Note that the below poison/unpoison interfaces do not involve + * hardware status change, hence do not require hardware support. + * They are mainly for testing hwpoison in software level. 
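/*
 * A minimal user-space sketch, not part of the patch, showing how the
 * debugfs files created by pfn_inject_init() can be driven for the
 * software-level testing described here.  It assumes debugfs is mounted
 * at /sys/kernel/debug and CAP_SYS_ADMIN; the pfn is a placeholder -
 * pick a real one (e.g. from a test process's pagemap) before poisoning
 * anything.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int write_val(const char *file, unsigned long long val)
{
	char path[128], buf[32];
	int fd, len;

	snprintf(path, sizeof(path), "/sys/kernel/debug/hwpoison/%s", file);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	len = snprintf(buf, sizeof(buf), "%llu\n", val);
	if (write(fd, buf, len) != len) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	unsigned long long pfn = 74565;		/* placeholder pfn */

	write_val("corrupt-filter-enable", 0);	/* 0: inject unconditionally */
	write_val("corrupt-pfn", pfn);		/* poison the page */
	write_val("unpoison-pfn", pfn);		/* and undo it again */
	return 0;
}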
+ */ + dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, NULL, &hwpoison_fops); - if (corrupt_pfn == NULL) { - pfn_inject_exit(); - return -ENOMEM; - } + if (!dentry) + goto fail; + + dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, + NULL, &unpoison_fops); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-enable", 0600, + hwpoison_dir, &hwpoison_filter_enable); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, + hwpoison_dir, &hwpoison_filter_dev_major); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, + hwpoison_dir, &hwpoison_filter_dev_minor); + if (!dentry) + goto fail; + + dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, + hwpoison_dir, &hwpoison_filter_flags_mask); + if (!dentry) + goto fail; + + dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, + hwpoison_dir, &hwpoison_filter_flags_value); + if (!dentry) + goto fail; + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP + dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, + hwpoison_dir, &hwpoison_filter_memcg); + if (!dentry) + goto fail; +#endif + return 0; +fail: + pfn_inject_exit(); + return -ENOMEM; } module_init(pfn_inject_init); diff --git a/mm/internal.h b/mm/internal.h index 22ec8d2b0fb8..6a697bb97fc5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page); */ extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); +#ifdef CONFIG_MEMORY_FAILURE +extern bool is_free_buddy_page(struct page *page); +#endif /* @@ -63,7 +66,7 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } -#ifdef CONFIG_HAVE_MLOCK +#ifdef CONFIG_MMU extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void munlock_vma_pages_range(struct vm_area_struct *vma, @@ -72,22 +75,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } -#endif /* - * unevictable_migrate_page() called only from migrate_page_copy() to - * migrate unevictable flag to new page. - * Note that the old page has been isolated from the LRU lists at this - * point so we don't need to worry about LRU statistics. - */ -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ - if (TestClearPageUnevictable(old)) - SetPageUnevictable(new); -} - -#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT -/* * Called only in fault path via page_evictable() for a new page * to determine if it's being mapped into a LOCKED vma. * If so, mark page as mlocked. @@ -107,9 +96,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) } /* - * must be called with vma's mmap_sem held for read, and page locked. + * must be called with vma's mmap_sem held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); +extern void munlock_vma_page(struct page *page); /* * Clear the page's PageMlocked(). 
This can be useful in a situation where @@ -144,7 +134,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } -#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ +#else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { return 0; @@ -153,7 +143,7 @@ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } -#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ +#endif /* !CONFIG_MMU */ /* * Return the mem_map entry representing the 'offset' subpage within @@ -260,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, #define ZONE_RECLAIM_SOME 0 #define ZONE_RECLAIM_SUCCESS 1 #endif + +extern int hwpoison_filter(struct page *p); + +extern u32 hwpoison_filter_dev_major; +extern u32 hwpoison_filter_dev_minor; +extern u64 hwpoison_filter_flags_mask; +extern u64 hwpoison_filter_flags_value; +extern u64 hwpoison_filter_memcg; +extern u32 hwpoison_filter_enable; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8bf765c4f58d..2c0d032ac898 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -72,7 +72,6 @@ #include <linux/module.h> #include <linux/kthread.h> #include <linux/prio_tree.h> -#include <linux/gfp.h> #include <linux/fs.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -93,6 +92,7 @@ #include <linux/nodemask.h> #include <linux/mm.h> #include <linux/workqueue.h> +#include <linux/crc32.h> #include <asm/sections.h> #include <asm/processor.h> @@ -108,7 +108,6 @@ #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ #define SECS_FIRST_SCAN 60 /* delay before the first scan */ #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ -#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ #define BYTES_PER_POINTER sizeof(void *) @@ -119,8 +118,8 @@ /* scanning area inside a memory block */ struct kmemleak_scan_area { struct hlist_node node; - unsigned long offset; - size_t length; + unsigned long start; + size_t size; }; #define KMEMLEAK_GREY 0 @@ -149,6 +148,8 @@ struct kmemleak_object { int min_count; /* the total number of pointers found pointing to this object */ int count; + /* checksum for detecting modified objects */ + u32 checksum; /* memory ranges to be scanned inside an object (empty for all) */ struct hlist_head area_list; unsigned long trace[MAX_TRACE]; @@ -164,8 +165,6 @@ struct kmemleak_object { #define OBJECT_REPORTED (1 << 1) /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) -/* flag set on newly allocated objects */ -#define OBJECT_NEW (1 << 3) /* number of bytes to print per line; must be 16 or 32 */ #define HEX_ROW_SIZE 16 @@ -241,8 +240,6 @@ struct early_log { const void *ptr; /* allocated/freed memory block */ size_t size; /* memory block size */ int min_count; /* minimum reference count */ - unsigned long offset; /* scan area offset */ - size_t length; /* scan area length */ unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -323,11 +320,6 @@ static bool color_gray(const struct kmemleak_object *object) object->count >= object->min_count; } -static bool color_black(const struct kmemleak_object *object) -{ - return object->min_count == KMEMLEAK_BLACK; -} - /* * Objects are considered unreferenced only if their color is white, they have * not be deleted and have a minimum age to avoid 
false positives caused by @@ -335,7 +327,7 @@ static bool color_black(const struct kmemleak_object *object) */ static bool unreferenced_object(struct kmemleak_object *object) { - return (object->flags & OBJECT_ALLOCATED) && color_white(object) && + return (color_white(object) && object->flags & OBJECT_ALLOCATED) && time_before_eq(object->jiffies + jiffies_min_age, jiffies_last_scan); } @@ -348,11 +340,13 @@ static void print_unreferenced(struct seq_file *seq, struct kmemleak_object *object) { int i; + unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", object->pointer, object->size); - seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", - object->comm, object->pid, object->jiffies); + seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", + object->comm, object->pid, object->jiffies, + msecs_age / 1000, msecs_age % 1000); hex_dump_object(seq, object); seq_printf(seq, " backtrace:\n"); @@ -381,6 +375,7 @@ static void dump_object_info(struct kmemleak_object *object) pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" checksum = %d\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); } @@ -522,12 +517,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, INIT_HLIST_HEAD(&object->area_list); spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); - object->flags = OBJECT_ALLOCATED | OBJECT_NEW; + object->flags = OBJECT_ALLOCATED; object->pointer = ptr; object->size = size; object->min_count = min_count; - object->count = -1; /* no color initially */ + object->count = 0; /* white color initially */ object->jiffies = jiffies; + object->checksum = 0; /* task information */ if (in_irq()) { @@ -720,14 +716,13 @@ static void make_black_object(unsigned long ptr) * Add a scanning area to the object. If at least one such area is added, * kmemleak will only scan these ranges rather than the whole memory block. */ -static void add_scan_area(unsigned long ptr, unsigned long offset, - size_t length, gfp_t gfp) +static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; struct kmemleak_scan_area *area; - object = find_and_get_object(ptr, 0); + object = find_and_get_object(ptr, 1); if (!object) { kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", ptr); @@ -741,7 +736,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, } spin_lock_irqsave(&object->lock, flags); - if (offset + length > object->size) { + if (ptr + size > object->pointer + object->size) { kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); @@ -749,8 +744,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, } INIT_HLIST_NODE(&area->node); - area->offset = offset; - area->length = length; + area->start = ptr; + area->size = size; hlist_add_head(&area->node, &object->area_list); out_unlock: @@ -786,7 +781,7 @@ static void object_no_scan(unsigned long ptr) * processed later once kmemleak is fully initialized. 
*/ static void __init log_early(int op_type, const void *ptr, size_t size, - int min_count, unsigned long offset, size_t length) + int min_count) { unsigned long flags; struct early_log *log; @@ -808,8 +803,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size, log->ptr = ptr; log->size = size; log->min_count = min_count; - log->offset = offset; - log->length = length; if (op_type == KMEMLEAK_ALLOC) log->trace_len = __save_stack_trace(log->trace); crt_early_log++; @@ -858,7 +851,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); + log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -873,7 +866,7 @@ void __ref kmemleak_free(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -888,7 +881,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); + log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -903,7 +896,7 @@ void __ref kmemleak_not_leak(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -919,22 +912,21 @@ void __ref kmemleak_ignore(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); /* * Limit the range to be scanned in an allocated memory block. */ -void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, - size_t length, gfp_t gfp) +void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) - add_scan_area((unsigned long)ptr, offset, length, gfp); + add_scan_area((unsigned long)ptr, size, gfp); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); + log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -948,11 +940,25 @@ void __ref kmemleak_no_scan(const void *ptr) if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); /* + * Update an object's checksum and return true if it was modified. 
+ */ +static bool update_checksum(struct kmemleak_object *object) +{ + u32 old_csum = object->checksum; + + if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) + return false; + + object->checksum = crc32(0, (void *)object->pointer, object->size); + return object->checksum != old_csum; +} + +/* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occured. */ @@ -1031,11 +1037,14 @@ static void scan_block(void *_start, void *_end, * added to the gray_list. */ object->count++; - if (color_gray(object)) + if (color_gray(object)) { list_add_tail(&object->gray_list, &gray_list); - else - put_object(object); + spin_unlock_irqrestore(&object->lock, flags); + continue; + } + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); } } @@ -1050,8 +1059,8 @@ static void scan_object(struct kmemleak_object *object) unsigned long flags; /* - * Once the object->lock is aquired, the corresponding memory block - * cannot be freed (the same lock is aquired in delete_object). + * Once the object->lock is acquired, the corresponding memory block + * cannot be freed (the same lock is acquired in delete_object). */ spin_lock_irqsave(&object->lock, flags); if (object->flags & OBJECT_NO_SCAN) @@ -1075,14 +1084,47 @@ static void scan_object(struct kmemleak_object *object) } } else hlist_for_each_entry(area, elem, &object->area_list, node) - scan_block((void *)(object->pointer + area->offset), - (void *)(object->pointer + area->offset - + area->length), object, 0); + scan_block((void *)area->start, + (void *)(area->start + area->size), + object, 0); out: spin_unlock_irqrestore(&object->lock, flags); } /* + * Scan the objects already referenced (gray objects). More objects will be + * referenced and, if there are no memory leaks, all the objects are scanned. + */ +static void scan_gray_list(void) +{ + struct kmemleak_object *object, *tmp; + + /* + * The list traversal is safe for both tail additions and removals + * from inside the loop. The kmemleak objects cannot be freed from + * outside the loop because their use_count was incremented. + */ + object = list_entry(gray_list.next, typeof(*object), gray_list); + while (&object->gray_list != &gray_list) { + cond_resched(); + + /* may add new objects to the list */ + if (!scan_should_stop()) + scan_object(object); + + tmp = list_entry(object->gray_list.next, typeof(*object), + gray_list); + + /* remove the object from the list and release it */ + list_del(&object->gray_list); + put_object(object); + + object = tmp; + } + WARN_ON(!list_empty(&gray_list)); +} + +/* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the * scan_mutex held. @@ -1090,10 +1132,9 @@ out: static void kmemleak_scan(void) { unsigned long flags; - struct kmemleak_object *object, *tmp; + struct kmemleak_object *object; int i; int new_leaks = 0; - int gray_list_pass = 0; jiffies_last_scan = jiffies; @@ -1114,7 +1155,6 @@ static void kmemleak_scan(void) #endif /* reset the reference count (whiten the object) */ object->count = 0; - object->flags &= ~OBJECT_NEW; if (color_gray(object) && get_object(object)) list_add_tail(&object->gray_list, &gray_list); @@ -1172,62 +1212,36 @@ static void kmemleak_scan(void) /* * Scan the objects already referenced from the sections scanned - * above. More objects will be referenced and, if there are no memory - * leaks, all the objects will be scanned. 
The list traversal is safe - * for both tail additions and removals from inside the loop. The - * kmemleak objects cannot be freed from outside the loop because their - * use_count was increased. + * above. */ -repeat: - object = list_entry(gray_list.next, typeof(*object), gray_list); - while (&object->gray_list != &gray_list) { - cond_resched(); - - /* may add new objects to the list */ - if (!scan_should_stop()) - scan_object(object); - - tmp = list_entry(object->gray_list.next, typeof(*object), - gray_list); - - /* remove the object from the list and release it */ - list_del(&object->gray_list); - put_object(object); - - object = tmp; - } - - if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) - goto scan_end; + scan_gray_list(); /* - * Check for new objects allocated during this scanning and add them - * to the gray list. + * Check for new or unreferenced objects modified since the previous + * scan and color them gray until the next scan. */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { spin_lock_irqsave(&object->lock, flags); - if ((object->flags & OBJECT_NEW) && !color_black(object) && - get_object(object)) { - object->flags &= ~OBJECT_NEW; + if (color_white(object) && (object->flags & OBJECT_ALLOCATED) + && update_checksum(object) && get_object(object)) { + /* color it gray temporarily */ + object->count = object->min_count; list_add_tail(&object->gray_list, &gray_list); } spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); - if (!list_empty(&gray_list)) - goto repeat; - -scan_end: - WARN_ON(!list_empty(&gray_list)); + /* + * Re-scan the gray list for modified unreferenced objects. + */ + scan_gray_list(); /* - * If scanning was stopped or new objects were being allocated at a - * higher rate than gray list scanning, do not report any new - * unreferenced objects. + * If scanning was stopped do not report any new unreferenced objects. 
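/*
 * A toy user-space model, not kernel code, of the checksum test used in
 * the second pass above: an object that is still white after the gray
 * list has been scanned is deferred rather than reported if its contents
 * changed since the previous scan, since references to it may still be
 * in flux.  The structure, names and the simple hash stand in for
 * kmemleak_object and its crc32() checksum.
 */
#include <stdbool.h>
#include <stddef.h>

struct toy_object {
	const unsigned char *data;	/* start of the tracked block */
	size_t size;
	unsigned int checksum;		/* value recorded at the last scan */
	int count;			/* references found this scan */
	int min_count;			/* references needed to be a non-leak */
};

static unsigned int toy_checksum(const unsigned char *p, size_t n)
{
	unsigned int sum = 0;

	while (n--)
		sum = sum * 31 + *p++;
	return sum;
}

/* true if the object should be colored gray and rescanned, not reported */
static bool defer_report(struct toy_object *obj)
{
	unsigned int csum = toy_checksum(obj->data, obj->size);
	bool modified = (csum != obj->checksum);

	obj->checksum = csum;
	if (obj->count < obj->min_count && modified) {
		obj->count = obj->min_count;	/* gray until the next scan */
		return true;
	}
	return false;
}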
*/ - if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) + if (scan_should_stop()) return; /* @@ -1642,8 +1656,7 @@ void __init kmemleak_init(void) kmemleak_ignore(log->ptr); break; case KMEMLEAK_SCAN_AREA: - kmemleak_scan_area(log->ptr, log->offset, log->length, - GFP_KERNEL); + kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); break; case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); @@ -29,11 +29,13 @@ #include <linux/wait.h> #include <linux/slab.h> #include <linux/rbtree.h> +#include <linux/memory.h> #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> #include <asm/tlbflush.h> +#include "internal.h" /* * A few notes about the KSM scanning process, @@ -79,13 +81,13 @@ * struct mm_slot - ksm information per mm that is being scanned * @link: link to the mm_slots hash list * @mm_list: link into the mm_slots list, rooted in ksm_mm_head - * @rmap_list: head for this mm_slot's list of rmap_items + * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ struct mm_slot { struct hlist_node link; struct list_head mm_list; - struct list_head rmap_list; + struct rmap_item *rmap_list; struct mm_struct *mm; }; @@ -93,7 +95,7 @@ struct mm_slot { * struct ksm_scan - cursor for scanning * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned - * @rmap_item: the current rmap that we are scanning inside the rmap_list + * @rmap_list: link to the next rmap to be scanned in the rmap_list * @seqnr: count of completed full scans (needed when removing unstable node) * * There is only the one ksm_scan instance of this cursor structure. @@ -101,37 +103,51 @@ struct mm_slot { struct ksm_scan { struct mm_slot *mm_slot; unsigned long address; - struct rmap_item *rmap_item; + struct rmap_item **rmap_list; unsigned long seqnr; }; /** + * struct stable_node - node of the stable rbtree + * @node: rb node of this ksm page in the stable tree + * @hlist: hlist head of rmap_items using this ksm page + * @kpfn: page frame number of this ksm page + */ +struct stable_node { + struct rb_node node; + struct hlist_head hlist; + unsigned long kpfn; +}; + +/** * struct rmap_item - reverse mapping item for virtual addresses - * @link: link into mm_slot's rmap_list (rmap_list is per mm) + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address - * @node: rb_node of this rmap_item in either unstable or stable tree - * @next: next rmap_item hanging off the same node of the stable tree - * @prev: previous rmap_item hanging off the same node of the stable tree + * @node: rb node of this rmap_item in the unstable tree + * @head: pointer to stable_node heading this list in the stable tree + * @hlist: link into hlist of rmap_items hanging off that stable_node */ struct rmap_item { - struct list_head link; + struct rmap_item *rmap_list; + struct anon_vma *anon_vma; /* when stable */ struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ + unsigned int oldchecksum; /* when unstable */ union { - unsigned int oldchecksum; /* when unstable */ - struct rmap_item *next; /* when stable */ - }; - union { - struct rb_node node; /* when tree node */ - struct rmap_item *prev; /* in 
stable list */ + struct rb_node node; /* when node of unstable tree */ + struct { /* when listed from stable tree */ + struct stable_node *head; + struct hlist_node hlist; + }; }; }; #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ -#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ -#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ +#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ +#define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ static struct rb_root root_stable_tree = RB_ROOT; @@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = { }; static struct kmem_cache *rmap_item_cache; +static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; /* The number of nodes in the stable tree */ @@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; -/* Limit on the number of unswappable pages used */ -static unsigned long ksm_max_kernel_pages; - /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -190,13 +204,19 @@ static int __init ksm_slab_init(void) if (!rmap_item_cache) goto out; + stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); + if (!stable_node_cache) + goto out_free1; + mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); if (!mm_slot_cache) - goto out_free; + goto out_free2; return 0; -out_free: +out_free2: + kmem_cache_destroy(stable_node_cache); +out_free1: kmem_cache_destroy(rmap_item_cache); out: return -ENOMEM; @@ -205,6 +225,7 @@ out: static void __init ksm_slab_free(void) { kmem_cache_destroy(mm_slot_cache); + kmem_cache_destroy(stable_node_cache); kmem_cache_destroy(rmap_item_cache); mm_slot_cache = NULL; } @@ -226,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) kmem_cache_free(rmap_item_cache, rmap_item); } +static inline struct stable_node *alloc_stable_node(void) +{ + return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); +} + +static inline void free_stable_node(struct stable_node *stable_node) +{ + kmem_cache_free(stable_node_cache, stable_node); +} + static inline struct mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ @@ -275,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) % MM_SLOTS_HASH_HEADS]; mm_slot->mm = mm; - INIT_LIST_HEAD(&mm_slot->rmap_list); hlist_add_head(&mm_slot->link, bucket); } @@ -284,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) return rmap_item->address & STABLE_FLAG; } +static void hold_anon_vma(struct rmap_item *rmap_item, + struct anon_vma *anon_vma) +{ + rmap_item->anon_vma = anon_vma; + atomic_inc(&anon_vma->ksm_refcount); +} + +static void drop_anon_vma(struct rmap_item *rmap_item) +{ + struct anon_vma *anon_vma = rmap_item->anon_vma; + + if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { + int empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + if (empty) + anon_vma_free(anon_vma); + } +} + /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -356,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) return (ret & VM_FAULT_OOM) ? 
-ENOMEM : 0; } -static void break_cow(struct mm_struct *mm, unsigned long addr) +static void break_cow(struct rmap_item *rmap_item) { + struct mm_struct *mm = rmap_item->mm; + unsigned long addr = rmap_item->address; struct vm_area_struct *vma; + /* + * It is not an accident that whenever we want to break COW + * to undo, we also need to drop a reference to the anon_vma. + */ + drop_anon_vma(rmap_item); + down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) goto out; @@ -403,21 +460,77 @@ out: page = NULL; return page; } +static void remove_node_from_stable_tree(struct stable_node *stable_node) +{ + struct rmap_item *rmap_item; + struct hlist_node *hlist; + + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + if (rmap_item->hlist.next) + ksm_pages_sharing--; + else + ksm_pages_shared--; + drop_anon_vma(rmap_item); + rmap_item->address &= PAGE_MASK; + cond_resched(); + } + + rb_erase(&stable_node->node, &root_stable_tree); + free_stable_node(stable_node); +} + /* - * get_ksm_page: checks if the page at the virtual address in rmap_item - * is still PageKsm, in which case we can trust the content of the page, - * and it returns the gotten page; but NULL if the page has been zapped. + * get_ksm_page: checks if the page indicated by the stable node + * is still its ksm page, despite having held no reference to it. + * In which case we can trust the content of the page, and it + * returns the gotten page; but if the page has now been zapped, + * remove the stale node from the stable tree and return NULL. + * + * You would expect the stable_node to hold a reference to the ksm page. + * But if it increments the page's count, swapping out has to wait for + * ksmd to come around again before it can free the page, which may take + * seconds or even minutes: much too unresponsive. So instead we use a + * "keyhole reference": access to the ksm page from the stable node peeps + * out through its keyhole to see if that page still holds the right key, + * pointing back to this stable node. This relies on freeing a PageAnon + * page to reset its page->mapping to NULL, and relies on no other use of + * a page to put something that might look like our key in page->mapping. + * + * include/linux/pagemap.h page_cache_get_speculative() is a good reference, + * but this is different - made simpler by ksm_thread_mutex being held, but + * interesting for assuming that no other use of the struct page could ever + * put our expected_mapping into page->mapping (or a field of the union which + * coincides with page->mapping). The RCU calls are not for KSM at all, but + * to keep the page_count protocol described with page_cache_get_speculative. + * + * Note: it is possible that get_ksm_page() will return NULL one moment, + * then page the next, if the page is in between page_freeze_refs() and + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page + * is on its way to being freed; but it is an anomaly to bear in mind. 
*/ -static struct page *get_ksm_page(struct rmap_item *rmap_item) +static struct page *get_ksm_page(struct stable_node *stable_node) { struct page *page; - - page = get_mergeable_page(rmap_item); - if (page && !PageKsm(page)) { + void *expected_mapping; + + page = pfn_to_page(stable_node->kpfn); + expected_mapping = (void *)stable_node + + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + rcu_read_lock(); + if (page->mapping != expected_mapping) + goto stale; + if (!get_page_unless_zero(page)) + goto stale; + if (page->mapping != expected_mapping) { put_page(page); - page = NULL; + goto stale; } + rcu_read_unlock(); return page; +stale: + rcu_read_unlock(); + remove_node_from_stable_tree(stable_node); + return NULL; } /* @@ -426,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item) */ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) { - if (in_stable_tree(rmap_item)) { - struct rmap_item *next_item = rmap_item->next; - - if (rmap_item->address & NODE_FLAG) { - if (next_item) { - rb_replace_node(&rmap_item->node, - &next_item->node, - &root_stable_tree); - next_item->address |= NODE_FLAG; - ksm_pages_sharing--; - } else { - rb_erase(&rmap_item->node, &root_stable_tree); - ksm_pages_shared--; - } - } else { - struct rmap_item *prev_item = rmap_item->prev; + if (rmap_item->address & STABLE_FLAG) { + struct stable_node *stable_node; + struct page *page; - BUG_ON(prev_item->next != rmap_item); - prev_item->next = next_item; - if (next_item) { - BUG_ON(next_item->prev != rmap_item); - next_item->prev = rmap_item->prev; - } + stable_node = rmap_item->head; + page = get_ksm_page(stable_node); + if (!page) + goto out; + + lock_page(page); + hlist_del(&rmap_item->hlist); + unlock_page(page); + put_page(page); + + if (stable_node->hlist.first) ksm_pages_sharing--; - } + else + ksm_pages_shared--; - rmap_item->next = NULL; + drop_anon_vma(rmap_item); + rmap_item->address &= PAGE_MASK; - } else if (rmap_item->address & NODE_FLAG) { + } else if (rmap_item->address & UNSTABLE_FLAG) { unsigned char age; /* * Usually ksmd can and must skip the rb_erase, because @@ -467,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) BUG_ON(age > 1); if (!age) rb_erase(&rmap_item->node, &root_unstable_tree); + ksm_pages_unshared--; + rmap_item->address &= PAGE_MASK; } - - rmap_item->address &= PAGE_MASK; - +out: cond_resched(); /* we're called from many long loops */ } static void remove_trailing_rmap_items(struct mm_slot *mm_slot, - struct list_head *cur) + struct rmap_item **rmap_list) { - struct rmap_item *rmap_item; - - while (cur != &mm_slot->rmap_list) { - rmap_item = list_entry(cur, struct rmap_item, link); - cur = cur->next; + while (*rmap_list) { + struct rmap_item *rmap_item = *rmap_list; + *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); - list_del(&rmap_item->link); free_rmap_item(rmap_item); } } @@ -550,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void) goto error; } - remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); + remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, @@ -646,8 +750,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * Check that no O_DIRECT or similar I/O is in progress on the * page */ - if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { - set_pte_at_notify(mm, addr, ptep, entry); + if (page_mapcount(page) + 1 + swapped != page_count(page)) { + 
set_pte_at(mm, addr, ptep, entry); goto out_unlock; } entry = pte_wrprotect(entry); @@ -664,15 +768,15 @@ out: /** * replace_page - replace page in vma by new ksm page - * @vma: vma that holds the pte pointing to oldpage - * @oldpage: the page we are replacing by newpage - * @newpage: the ksm page we replace oldpage by + * @vma: vma that holds the pte pointing to page + * @page: the page we are replacing by kpage + * @kpage: the ksm page we replace page by * @orig_pte: the original value of the pte * * Returns 0 on success, -EFAULT on failure. */ -static int replace_page(struct vm_area_struct *vma, struct page *oldpage, - struct page *newpage, pte_t orig_pte) +static int replace_page(struct vm_area_struct *vma, struct page *page, + struct page *kpage, pte_t orig_pte) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -681,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, pte_t *ptep; spinlock_t *ptl; unsigned long addr; - pgprot_t prot; int err = -EFAULT; - prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); - - addr = page_address_in_vma(oldpage, vma); + addr = page_address_in_vma(page, vma); if (addr == -EFAULT) goto out; @@ -708,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, goto out; } - get_page(newpage); - page_add_ksm_rmap(newpage); + get_page(kpage); + page_add_anon_rmap(kpage, vma, addr); flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); + set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(oldpage); - put_page(oldpage); + page_remove_rmap(page); + put_page(page); pte_unmap_unlock(ptep, ptl); err = 0; @@ -726,32 +827,27 @@ out: /* * try_to_merge_one_page - take two pages and merge them into one - * @vma: the vma that hold the pte pointing into oldpage - * @oldpage: the page that we want to replace with newpage - * @newpage: the page that we want to map instead of oldpage - * - * Note: - * oldpage should be a PageAnon page, while newpage should be a PageKsm page, - * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. + * @vma: the vma that holds the pte pointing to page + * @page: the PageAnon page that we want to replace with kpage + * @kpage: the PageKsm page that we want to map instead of page, + * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_one_page(struct vm_area_struct *vma, - struct page *oldpage, - struct page *newpage) + struct page *page, struct page *kpage) { pte_t orig_pte = __pte(0); int err = -EFAULT; + if (page == kpage) /* ksm page forked */ + return 0; + if (!(vma->vm_flags & VM_MERGEABLE)) goto out; - - if (!PageAnon(oldpage)) + if (!PageAnon(page)) goto out; - get_page(newpage); - get_page(oldpage); - /* * We need the page lock to read a stable PageSwapCache in * write_protect_page(). We use trylock_page() instead of @@ -759,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, * prefer to continue scanning and merging different pages, * then come back to this page when it is unlocked. */ - if (!trylock_page(oldpage)) - goto out_putpage; + if (!trylock_page(page)) + goto out; /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its * ptes are necessarily already write-protected. 
But in either * case, we need to lock and check page_count is not raised. */ - if (write_protect_page(vma, oldpage, &orig_pte)) { - unlock_page(oldpage); - goto out_putpage; + if (write_protect_page(vma, page, &orig_pte) == 0) { + if (!kpage) { + /* + * While we hold page lock, upgrade page from + * PageAnon+anon_vma to PageKsm+NULL stable_node: + * stable_tree_insert() will update stable_node. + */ + set_page_stable_node(page, NULL); + mark_page_accessed(page); + err = 0; + } else if (pages_identical(page, kpage)) + err = replace_page(vma, page, kpage, orig_pte); } - unlock_page(oldpage); - if (pages_identical(oldpage, newpage)) - err = replace_page(vma, oldpage, newpage, orig_pte); + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { + munlock_vma_page(page); + if (!PageMlocked(kpage)) { + unlock_page(page); + lock_page(kpage); + mlock_vma_page(kpage); + page = kpage; /* for final unlock */ + } + } -out_putpage: - put_page(oldpage); - put_page(newpage); + unlock_page(page); out: return err; } @@ -786,26 +895,31 @@ out: /* * try_to_merge_with_ksm_page - like try_to_merge_two_pages, * but no new kernel page is allocated: kpage must already be a ksm page. + * + * This function returns 0 if the pages were merged, -EFAULT otherwise. */ -static int try_to_merge_with_ksm_page(struct mm_struct *mm1, - unsigned long addr1, - struct page *page1, - struct page *kpage) +static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, + struct page *page, struct page *kpage) { + struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; - down_read(&mm1->mmap_sem); - if (ksm_test_exit(mm1)) + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, rmap_item->address); + if (!vma || vma->vm_start > rmap_item->address) goto out; - vma = find_vma(mm1, addr1); - if (!vma || vma->vm_start > addr1) + err = try_to_merge_one_page(vma, page, kpage); + if (err) goto out; - err = try_to_merge_one_page(vma, page1, kpage); + /* Must get reference to anon_vma while still holding mmap_sem */ + hold_anon_vma(rmap_item, vma->anon_vma); out: - up_read(&mm1->mmap_sem); + up_read(&mm->mmap_sem); return err; } @@ -813,109 +927,73 @@ out: * try_to_merge_two_pages - take two identical pages and prepare them * to be merged into one page. * - * This function returns 0 if we successfully mapped two identical pages - * into one page, -EFAULT otherwise. + * This function returns the kpage if we successfully merged two identical + * pages into one ksm page, NULL otherwise. * - * Note that this function allocates a new kernel page: if one of the pages + * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ -static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, - struct page *page1, struct mm_struct *mm2, - unsigned long addr2, struct page *page2) +static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, + struct page *page, + struct rmap_item *tree_rmap_item, + struct page *tree_page) { - struct vm_area_struct *vma; - struct page *kpage; - int err = -EFAULT; - - /* - * The number of nodes in the stable tree - * is the number of kernel pages that we hold. 
- */ - if (ksm_max_kernel_pages && - ksm_max_kernel_pages <= ksm_pages_shared) - return err; - - kpage = alloc_page(GFP_HIGHUSER); - if (!kpage) - return err; - - down_read(&mm1->mmap_sem); - if (ksm_test_exit(mm1)) { - up_read(&mm1->mmap_sem); - goto out; - } - vma = find_vma(mm1, addr1); - if (!vma || vma->vm_start > addr1) { - up_read(&mm1->mmap_sem); - goto out; - } - - copy_user_highpage(kpage, page1, addr1, vma); - err = try_to_merge_one_page(vma, page1, kpage); - up_read(&mm1->mmap_sem); + int err; + err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { - err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); + err = try_to_merge_with_ksm_page(tree_rmap_item, + tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) - break_cow(mm1, addr1); + break_cow(rmap_item); } -out: - put_page(kpage); - return err; + return err ? NULL : page; } /* - * stable_tree_search - search page inside the stable tree - * @page: the page that we are searching identical pages to. - * @page2: pointer into identical page that we are holding inside the stable - * tree that we have found. - * @rmap_item: the reverse mapping item + * stable_tree_search - search for page inside the stable tree * * This function checks if there is a page inside the stable tree * with identical content to the page that we are scanning right now. * - * This function return rmap_item pointer to the identical item if found, + * This function returns the stable tree node of identical content if found, * NULL otherwise. */ -static struct rmap_item *stable_tree_search(struct page *page, - struct page **page2, - struct rmap_item *rmap_item) +static struct page *stable_tree_search(struct page *page) { struct rb_node *node = root_stable_tree.rb_node; + struct stable_node *stable_node; + + stable_node = page_stable_node(page); + if (stable_node) { /* ksm page forked */ + get_page(page); + return page; + } while (node) { - struct rmap_item *tree_rmap_item, *next_rmap_item; + struct page *tree_page; int ret; - tree_rmap_item = rb_entry(node, struct rmap_item, node); - while (tree_rmap_item) { - BUG_ON(!in_stable_tree(tree_rmap_item)); - cond_resched(); - page2[0] = get_ksm_page(tree_rmap_item); - if (page2[0]) - break; - next_rmap_item = tree_rmap_item->next; - remove_rmap_item_from_tree(tree_rmap_item); - tree_rmap_item = next_rmap_item; - } - if (!tree_rmap_item) + cond_resched(); + stable_node = rb_entry(node, struct stable_node, node); + tree_page = get_ksm_page(stable_node); + if (!tree_page) return NULL; - ret = memcmp_pages(page, page2[0]); + ret = memcmp_pages(page, tree_page); if (ret < 0) { - put_page(page2[0]); + put_page(tree_page); node = node->rb_left; } else if (ret > 0) { - put_page(page2[0]); + put_page(tree_page); node = node->rb_right; - } else { - return tree_rmap_item; - } + } else + return tree_page; } return NULL; @@ -925,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page, * stable_tree_insert - insert rmap_item pointing to new ksm page * into the stable tree. * - * @page: the page that we are searching identical page to inside the stable - * tree. - * @rmap_item: pointer to the reverse mapping item. - * - * This function returns rmap_item if success, NULL otherwise. + * This function returns the stable tree node just allocated on success, + * NULL otherwise. 
*/ -static struct rmap_item *stable_tree_insert(struct page *page, - struct rmap_item *rmap_item) +static struct stable_node *stable_tree_insert(struct page *kpage) { struct rb_node **new = &root_stable_tree.rb_node; struct rb_node *parent = NULL; + struct stable_node *stable_node; while (*new) { - struct rmap_item *tree_rmap_item, *next_rmap_item; struct page *tree_page; int ret; - tree_rmap_item = rb_entry(*new, struct rmap_item, node); - while (tree_rmap_item) { - BUG_ON(!in_stable_tree(tree_rmap_item)); - cond_resched(); - tree_page = get_ksm_page(tree_rmap_item); - if (tree_page) - break; - next_rmap_item = tree_rmap_item->next; - remove_rmap_item_from_tree(tree_rmap_item); - tree_rmap_item = next_rmap_item; - } - if (!tree_rmap_item) + cond_resched(); + stable_node = rb_entry(*new, struct stable_node, node); + tree_page = get_ksm_page(stable_node); + if (!tree_page) return NULL; - ret = memcmp_pages(page, tree_page); + ret = memcmp_pages(kpage, tree_page); put_page(tree_page); parent = *new; @@ -974,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page, } } - rmap_item->address |= NODE_FLAG | STABLE_FLAG; - rmap_item->next = NULL; - rb_link_node(&rmap_item->node, parent, new); - rb_insert_color(&rmap_item->node, &root_stable_tree); + stable_node = alloc_stable_node(); + if (!stable_node) + return NULL; - ksm_pages_shared++; - return rmap_item; + rb_link_node(&stable_node->node, parent, new); + rb_insert_color(&stable_node->node, &root_stable_tree); + + INIT_HLIST_HEAD(&stable_node->hlist); + + stable_node->kpfn = page_to_pfn(kpage); + set_page_stable_node(kpage, stable_node); + + return stable_node; } /* - * unstable_tree_search_insert - search and insert items into the unstable tree. - * - * @page: the page that we are going to search for identical page or to insert - * into the unstable tree - * @page2: pointer into identical page that was found inside the unstable tree - * @rmap_item: the reverse mapping item of page + * unstable_tree_search_insert - search for identical page, + * else insert rmap_item into the unstable tree. * * This function searches for a page in the unstable tree identical to the * page currently being scanned; and if no identical page is found in the @@ -1001,47 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page, * This function does both searching and inserting, because they share * the same walking algorithm in an rbtree. */ -static struct rmap_item *unstable_tree_search_insert(struct page *page, - struct page **page2, - struct rmap_item *rmap_item) +static +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, + struct page *page, + struct page **tree_pagep) + { struct rb_node **new = &root_unstable_tree.rb_node; struct rb_node *parent = NULL; while (*new) { struct rmap_item *tree_rmap_item; + struct page *tree_page; int ret; cond_resched(); tree_rmap_item = rb_entry(*new, struct rmap_item, node); - page2[0] = get_mergeable_page(tree_rmap_item); - if (!page2[0]) + tree_page = get_mergeable_page(tree_rmap_item); + if (!tree_page) return NULL; /* - * Don't substitute an unswappable ksm page - * just for one good swappable forked page. + * Don't substitute a ksm page for a forked page. 
*/ - if (page == page2[0]) { - put_page(page2[0]); + if (page == tree_page) { + put_page(tree_page); return NULL; } - ret = memcmp_pages(page, page2[0]); + ret = memcmp_pages(page, tree_page); parent = *new; if (ret < 0) { - put_page(page2[0]); + put_page(tree_page); new = &parent->rb_left; } else if (ret > 0) { - put_page(page2[0]); + put_page(tree_page); new = &parent->rb_right; } else { + *tree_pagep = tree_page; return tree_rmap_item; } } - rmap_item->address |= NODE_FLAG; + rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, &root_unstable_tree); @@ -1056,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page, * the same ksm page. */ static void stable_tree_append(struct rmap_item *rmap_item, - struct rmap_item *tree_rmap_item) + struct stable_node *stable_node) { - rmap_item->next = tree_rmap_item->next; - rmap_item->prev = tree_rmap_item; - - if (tree_rmap_item->next) - tree_rmap_item->next->prev = rmap_item; - - tree_rmap_item->next = rmap_item; + rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; + hlist_add_head(&rmap_item->hlist, &stable_node->hlist); - ksm_pages_sharing++; + if (rmap_item->hlist.next) + ksm_pages_sharing++; + else + ksm_pages_shared++; } /* @@ -1081,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) { - struct page *page2[1]; struct rmap_item *tree_rmap_item; + struct page *tree_page = NULL; + struct stable_node *stable_node; + struct page *kpage; unsigned int checksum; int err; - if (in_stable_tree(rmap_item)) - remove_rmap_item_from_tree(rmap_item); + remove_rmap_item_from_tree(rmap_item); /* We first start with searching the page inside the stable tree */ - tree_rmap_item = stable_tree_search(page, page2, rmap_item); - if (tree_rmap_item) { - if (page == page2[0]) /* forked */ - err = 0; - else - err = try_to_merge_with_ksm_page(rmap_item->mm, - rmap_item->address, - page, page2[0]); - put_page(page2[0]); - + kpage = stable_tree_search(page); + if (kpage) { + err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ - stable_tree_append(rmap_item, tree_rmap_item); + lock_page(kpage); + stable_tree_append(rmap_item, page_stable_node(kpage)); + unlock_page(kpage); } + put_page(kpage); return; } /* - * A ksm page might have got here by fork, but its other - * references have already been removed from the stable tree. - * Or it might be left over from a break_ksm which failed - * when the mem_cgroup had reached its limit: try again now. - */ - if (PageKsm(page)) - break_cow(rmap_item->mm, rmap_item->address); - - /* - * In case the hash value of the page was changed from the last time we - * have calculated it, this page to be changed frequely, therefore we - * don't want to insert it to the unstable tree, and we don't want to - * waste our time to search if there is something identical to it there. + * If the hash value of the page has changed from the last time + * we calculated it, this page is changing frequently: therefore we + * don't want to insert it in the unstable tree, and we don't want + * to waste our time searching for something identical to it there. 
*/ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { @@ -1131,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) return; } - tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); + tree_rmap_item = + unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { - err = try_to_merge_two_pages(rmap_item->mm, - rmap_item->address, page, - tree_rmap_item->mm, - tree_rmap_item->address, page2[0]); + kpage = try_to_merge_two_pages(rmap_item, page, + tree_rmap_item, tree_page); + put_page(tree_page); /* * As soon as we merge this page, we want to remove the * rmap_item of the page we have merged with from the unstable * tree, and insert it instead as new node in the stable tree. */ - if (!err) { - rb_erase(&tree_rmap_item->node, &root_unstable_tree); - tree_rmap_item->address &= ~NODE_FLAG; - ksm_pages_unshared--; + if (kpage) { + remove_rmap_item_from_tree(tree_rmap_item); + + lock_page(kpage); + stable_node = stable_tree_insert(kpage); + if (stable_node) { + stable_tree_append(tree_rmap_item, stable_node); + stable_tree_append(rmap_item, stable_node); + } + unlock_page(kpage); /* * If we fail to insert the page into the stable tree, @@ -1153,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) * to a ksm page left outside the stable tree, * in which case we need to break_cow on both. */ - if (stable_tree_insert(page2[0], tree_rmap_item)) - stable_tree_append(rmap_item, tree_rmap_item); - else { - break_cow(tree_rmap_item->mm, - tree_rmap_item->address); - break_cow(rmap_item->mm, rmap_item->address); + if (!stable_node) { + break_cow(tree_rmap_item); + break_cow(rmap_item); } } - - put_page(page2[0]); } } static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, - struct list_head *cur, + struct rmap_item **rmap_list, unsigned long addr) { struct rmap_item *rmap_item; - while (cur != &mm_slot->rmap_list) { - rmap_item = list_entry(cur, struct rmap_item, link); - if ((rmap_item->address & PAGE_MASK) == addr) { - if (!in_stable_tree(rmap_item)) - remove_rmap_item_from_tree(rmap_item); + while (*rmap_list) { + rmap_item = *rmap_list; + if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; - } if (rmap_item->address > addr) break; - cur = cur->next; + *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); - list_del(&rmap_item->link); free_rmap_item(rmap_item); } @@ -1192,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, /* It has already been zeroed */ rmap_item->mm = mm_slot->mm; rmap_item->address = addr; - list_add_tail(&rmap_item->link, cur); + rmap_item->rmap_list = *rmap_list; + *rmap_list = rmap_item; } return rmap_item; } @@ -1217,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) spin_unlock(&ksm_mmlist_lock); next_mm: ksm_scan.address = 0; - ksm_scan.rmap_item = list_entry(&slot->rmap_list, - struct rmap_item, link); + ksm_scan.rmap_list = &slot->rmap_list; } mm = slot->mm; @@ -1244,10 +1298,10 @@ next_mm: flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, - ksm_scan.rmap_item->link.next, - ksm_scan.address); + ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { - ksm_scan.rmap_item = rmap_item; + ksm_scan.rmap_list = + &rmap_item->rmap_list; ksm_scan.address += PAGE_SIZE; } else put_page(*page); @@ -1263,14 +1317,13 @@ next_mm: if (ksm_test_exit(mm)) { ksm_scan.address = 0; - 
ksm_scan.rmap_item = list_entry(&slot->rmap_list, - struct rmap_item, link); + ksm_scan.rmap_list = &slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ - remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); + remove_trailing_rmap_items(slot, ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(slot->mm_list.next, @@ -1323,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages) return; if (!PageKsm(page) || !in_stable_tree(rmap_item)) cmp_and_merge_page(page, rmap_item); - else if (page_mapcount(page) == 1) { - /* - * Replace now-unshared ksm page by ordinary page. - */ - break_cow(rmap_item->mm, rmap_item->address); - remove_rmap_item_from_tree(rmap_item); - rmap_item->oldchecksum = calc_checksum(page); - } put_page(page); } } @@ -1375,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) + VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) return 0; /* just ignore the advice */ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { @@ -1452,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm) spin_lock(&ksm_mmlist_lock); mm_slot = get_mm_slot(mm); if (mm_slot && ksm_scan.mm_slot != mm_slot) { - if (list_empty(&mm_slot->rmap_list)) { + if (!mm_slot->rmap_list) { hlist_del(&mm_slot->link); list_del(&mm_slot->mm_list); easy_to_free = 1; @@ -1473,6 +1518,255 @@ void __ksm_exit(struct mm_struct *mm) } } +struct page *ksm_does_need_to_copy(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct page *new_page; + + unlock_page(page); /* any racers will COW it, not modify it */ + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (new_page) { + copy_user_highpage(new_page, page, address, vma); + + SetPageDirty(new_page); + __SetPageUptodate(new_page); + SetPageSwapBacked(new_page); + __set_page_locked(new_page); + + if (page_evictable(new_page, vma)) + lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); + else + add_page_to_unevictable_list(new_page); + } + + page_cache_release(page); + return new_page; +} + +int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, + unsigned long *vm_flags) +{ + struct stable_node *stable_node; + struct rmap_item *rmap_item; + struct hlist_node *hlist; + unsigned int mapcount = page_mapcount(page); + int referenced = 0; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return 0; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. 
+ */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) + continue; + + referenced += page_referenced_one(page, vma, + rmap_item->address, &mapcount, vm_flags); + if (!search_new_forks || !mapcount) + break; + } + spin_unlock(&anon_vma->lock); + if (!mapcount) + goto out; + } + if (!search_new_forks++) + goto again; +out: + return referenced; +} + +int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) +{ + struct stable_node *stable_node; + struct hlist_node *hlist; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return SWAP_FAIL; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + ret = try_to_unmap_one(page, vma, + rmap_item->address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) { + spin_unlock(&anon_vma->lock); + goto out; + } + } + spin_unlock(&anon_vma->lock); + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +#ifdef CONFIG_MIGRATION +int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct stable_node *stable_node; + struct hlist_node *hlist; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return ret; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. 
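page_referenced_ksm(), try_to_unmap_ksm() and rmap_walk_ksm() all use the same two-pass trick described in the comment above: pass 0 visits only the vma whose mm matches the rmap_item, pass 1 revisits the list for vmas belonging to other mms (forks made since ksmd recorded the item). A stripped-down illustration of the `(rmap_item->mm == vma->vm_mm) == search_new_forks` filter, with toy types standing in for the kernel structures:

#include <stdio.h>

struct toy_vma { int mm_id; };	/* stands in for vm_area_struct/vm_mm */

static void walk(const struct toy_vma *vmas, int n, int item_mm)
{
	int search_new_forks = 0;
	int i;
again:
	for (i = 0; i < n; i++) {
		/*
		 * Pass 0 (search_new_forks == 0): skip everything except
		 * the vma whose mm matches the rmap_item.  Pass 1: skip
		 * that one and visit only the forked mms.
		 */
		if ((vmas[i].mm_id == item_mm) == search_new_forks)
			continue;
		printf("pass %d: visiting vma of mm %d\n",
		       search_new_forks, vmas[i].mm_id);
	}
	if (!search_new_forks++)
		goto again;
}

int main(void)
{
	struct toy_vma vmas[] = { { 1 }, { 2 }, { 3 } };

	walk(vmas, 3, 2);	/* the rmap_item belongs to mm 2 */
	return 0;
}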
+ */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + ret = rmap_one(page, vma, rmap_item->address, arg); + if (ret != SWAP_AGAIN) { + spin_unlock(&anon_vma->lock); + goto out; + } + } + spin_unlock(&anon_vma->lock); + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +void ksm_migrate_page(struct page *newpage, struct page *oldpage) +{ + struct stable_node *stable_node; + + VM_BUG_ON(!PageLocked(oldpage)); + VM_BUG_ON(!PageLocked(newpage)); + VM_BUG_ON(newpage->mapping != oldpage->mapping); + + stable_node = page_stable_node(newpage); + if (stable_node) { + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); + stable_node->kpfn = page_to_pfn(newpage); + } +} +#endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_MEMORY_HOTREMOVE +static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, + unsigned long end_pfn) +{ + struct rb_node *node; + + for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { + struct stable_node *stable_node; + + stable_node = rb_entry(node, struct stable_node, node); + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) + return stable_node; + } + return NULL; +} + +static int ksm_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + struct stable_node *stable_node; + + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Keep it very simple for now: just lock out ksmd and + * MADV_UNMERGEABLE while any memory is going offline. + */ + mutex_lock(&ksm_thread_mutex); + break; + + case MEM_OFFLINE: + /* + * Most of the work is done by page migration; but there might + * be a few stable_nodes left over, still pointing to struct + * pages which have been offlined: prune those from the tree. + */ + while ((stable_node = ksm_check_stable_tree(mn->start_pfn, + mn->start_pfn + mn->nr_pages)) != NULL) + remove_node_from_stable_tree(stable_node); + /* fallthrough */ + + case MEM_CANCEL_OFFLINE: + mutex_unlock(&ksm_thread_mutex); + break; + } + return NOTIFY_OK; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. @@ -1551,8 +1845,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, - * breaking COW to free the unswappable pages_shared (but leaves - * mm_slots on the list for when ksmd may be set running again). + * breaking COW to free the pages_shared (but leaves mm_slots + * on the list for when ksmd may be set running again). 
*/ mutex_lock(&ksm_thread_mutex); @@ -1577,29 +1871,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); -static ssize_t max_kernel_pages_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long nr_pages; - - err = strict_strtoul(buf, 10, &nr_pages); - if (err) - return -EINVAL; - - ksm_max_kernel_pages = nr_pages; - - return count; -} - -static ssize_t max_kernel_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", ksm_max_kernel_pages); -} -KSM_ATTR(max_kernel_pages); - static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1649,7 +1920,6 @@ static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, - &max_kernel_pages_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, @@ -1669,8 +1939,6 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; - ksm_max_kernel_pages = totalram_pages / 4; - err = ksm_slab_init(); if (err) goto out; @@ -1698,6 +1966,13 @@ static int __init ksm_init(void) #endif /* CONFIG_SYSFS */ +#ifdef CONFIG_MEMORY_HOTREMOVE + /* + * Choose a high priority since the callback takes ksm_thread_mutex: + * later callbacks could only be taking locks which nest within that. + */ + hotplug_memory_notifier(ksm_memory_callback, 100); +#endif return 0; out_free2: diff --git a/mm/maccess.c b/mm/maccess.c index 9073695ff25f..4e348dbaecd7 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -14,7 +14,11 @@ * Safely read from address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ -long probe_kernel_read(void *dst, void *src, size_t size) + +long __weak probe_kernel_read(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + +long __probe_kernel_read(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); @@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ -long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) +long __weak probe_kernel_write(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_write"))); + +long __probe_kernel_write(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); diff --git a/mm/madvise.c b/mm/madvise.c index 35b1479b7c9d..319528b8db74 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -9,6 +9,7 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/mempolicy.h> +#include <linux/page-isolation.h> #include <linux/hugetlb.h> #include <linux/sched.h> #include <linux/ksm.h> @@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma, /* * Error injection support for memory error handling. 
*/ -static int madvise_hwpoison(unsigned long start, unsigned long end) +static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) { int ret = 0; @@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) return -EPERM; for (; start < end; start += PAGE_SIZE) { struct page *p; - int ret = get_user_pages(current, current->mm, start, 1, - 0, 0, &p, NULL); + int ret = get_user_pages_fast(start, 1, 0, &p); if (ret != 1) return ret; + if (bhv == MADV_SOFT_OFFLINE) { + printk(KERN_INFO "Soft offlining page %lx at %lx\n", + page_to_pfn(p), start); + ret = soft_offline_page(p, MF_COUNT_INCREASED); + if (ret) + break; + continue; + } printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", page_to_pfn(p), start); /* Ignore return value for now */ - __memory_failure(page_to_pfn(p), 0, 1); - put_page(p); + __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); } return ret; } @@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) size_t len; #ifdef CONFIG_MEMORY_FAILURE - if (behavior == MADV_HWPOISON) - return madvise_hwpoison(start, start+len_in); + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) + return madvise_hwpoison(behavior, start, start+len_in); #endif if (!madvise_behavior_valid(behavior)) return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f99f5991d6bb..f4ede99c8b9b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6,6 +6,10 @@ * Copyright 2007 OpenVZ SWsoft Inc * Author: Pavel Emelianov <xemul@openvz.org> * + * Memory thresholds + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -21,6 +25,7 @@ #include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/mm.h> +#include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/smp.h> #include <linux/page-flags.h> @@ -32,12 +37,16 @@ #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/swap.h> +#include <linux/swapops.h> #include <linux/spinlock.h> +#include <linux/eventfd.h> +#include <linux/sort.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> #include <linux/mm_inline.h> #include <linux/page_cgroup.h> +#include <linux/cpu.h> #include "internal.h" #include <asm/uaccess.h> @@ -54,8 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ #define do_swap_account (0) #endif -static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ -#define SOFTLIMIT_EVENTS_THRESH (1000) +/* + * Per memcg event counter is incremented at every pagein/pageout. This counter + * is used for trigger some periodic events. This is straightforward and better + * than using jiffies etc. to handle periodic memcg event. + * + * These values will be used as !((event) & ((1 <<(thresh)) - 1)) + */ +#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ +#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ /* * Statistics for memory cgroup. 
@@ -66,65 +82,19 @@ enum mem_cgroup_stat_index { */ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ + MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ - MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ + MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ MEM_CGROUP_STAT_NSTATS, }; struct mem_cgroup_stat_cpu { s64 count[MEM_CGROUP_STAT_NSTATS]; -} ____cacheline_aligned_in_smp; - -struct mem_cgroup_stat { - struct mem_cgroup_stat_cpu cpustat[0]; }; -static inline void -__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, - enum mem_cgroup_stat_index idx) -{ - stat->count[idx] = 0; -} - -static inline s64 -__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, - enum mem_cgroup_stat_index idx) -{ - return stat->count[idx]; -} - -/* - * For accounting under irq disable, no need for increment preempt count. - */ -static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, - enum mem_cgroup_stat_index idx, int val) -{ - stat->count[idx] += val; -} - -static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, - enum mem_cgroup_stat_index idx) -{ - int cpu; - s64 ret = 0; - for_each_possible_cpu(cpu) - ret += stat->cpustat[cpu].count[idx]; - return ret; -} - -static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) -{ - s64 ret; - - ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); - ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); - return ret; -} - /* * per-zone information in memory controller. */ @@ -174,6 +144,22 @@ struct mem_cgroup_tree { static struct mem_cgroup_tree soft_limit_tree __read_mostly; +struct mem_cgroup_threshold { + struct eventfd_ctx *eventfd; + u64 threshold; +}; + +struct mem_cgroup_threshold_ary { + /* An array index points to threshold just below usage. */ + atomic_t current_threshold; + /* Size of entries[] */ + unsigned int size; + /* Array of thresholds */ + struct mem_cgroup_threshold entries[0]; +}; + +static void mem_cgroup_threshold(struct mem_cgroup *mem); + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -209,7 +195,7 @@ struct mem_cgroup { int prev_priority; /* for recording reclaim priority */ /* - * While reclaiming in a hiearchy, we cache the last child we + * While reclaiming in a hierarchy, we cache the last child we * reclaimed from. */ int last_scanned_child; @@ -217,7 +203,7 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; - unsigned long last_oom_jiffies; + atomic_t oom_lock; atomic_t refcnt; unsigned int swappiness; @@ -225,10 +211,48 @@ struct mem_cgroup { /* set when res.limit == memsw.limit */ bool memsw_is_minimum; + /* protect arrays of thresholds */ + struct mutex thresholds_lock; + + /* thresholds for memory usage. RCU-protected */ + struct mem_cgroup_threshold_ary *thresholds; + + /* thresholds for mem+swap usage. RCU-protected */ + struct mem_cgroup_threshold_ary *memsw_thresholds; + + /* + * Should we move charges of a task when a task is moved into this + * mem_cgroup ? And what type of charges should we move ? 
+ */ + unsigned long move_charge_at_immigrate; + /* - * statistics. This must be placed at the end of memcg. + * percpu counter. */ - struct mem_cgroup_stat stat; + struct mem_cgroup_stat_cpu *stat; +}; + +/* Stuffs for move charges at task migration. */ +/* + * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a + * left-shifted bitmap of these types. + */ +enum move_type { + MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ + NR_MOVE_TYPE, +}; + +/* "mc" and its members are protected by cgroup_mutex */ +static struct move_charge_struct { + struct mem_cgroup *from; + struct mem_cgroup *to; + unsigned long precharge; + unsigned long moved_charge; + unsigned long moved_swap; + struct task_struct *moving_task; /* a task moving charges */ + wait_queue_head_t waitq; /* a waitq for other context */ +} mc = { + .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), }; /* @@ -275,6 +299,7 @@ enum charge_type { static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); +static void drain_all_stock_async(void); static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) @@ -282,6 +307,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) return &mem->info.nodeinfo[nid]->zoneinfo[zid]; } +struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +{ + return &mem->css; +} + static struct mem_cgroup_per_zone * page_cgroup_zoneinfo(struct page_cgroup *pc) { @@ -365,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem, spin_unlock(&mctz->lock); } -static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) -{ - bool ret = false; - int cpu; - s64 val; - struct mem_cgroup_stat_cpu *cpustat; - - cpu = get_cpu(); - cpustat = &mem->stat.cpustat[cpu]; - val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); - if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { - __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); - ret = true; - } - put_cpu(); - return ret; -} static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) { @@ -475,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) return mz; } +static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) +{ + int cpu; + s64 val = 0; + + for_each_possible_cpu(cpu) + val += per_cpu(mem->stat->count[idx], cpu); + return val; +} + +static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) +{ + s64 ret; + + ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); + ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); + return ret; +} + static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, bool charge) { int val = (charge) ? 1 : -1; - struct mem_cgroup_stat *stat = &mem->stat; - struct mem_cgroup_stat_cpu *cpustat; - int cpu = get_cpu(); - - cpustat = &stat->cpustat[cpu]; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); - put_cpu(); + this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, @@ -493,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, bool charge) { int val = (charge) ? 
1 : -1; - struct mem_cgroup_stat *stat = &mem->stat; - struct mem_cgroup_stat_cpu *cpustat; - int cpu = get_cpu(); - cpustat = &stat->cpustat[cpu]; + preempt_disable(); + if (PageCgroupCache(pc)) - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); else - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); if (charge) - __mem_cgroup_stat_add_safe(cpustat, - MEM_CGROUP_STAT_PGPGIN_COUNT, 1); + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); else - __mem_cgroup_stat_add_safe(cpustat, - MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); - put_cpu(); + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); + + preempt_enable(); } static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, @@ -528,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, return total; } +static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) +{ + s64 val; + + val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); + + return !(val & ((1 << event_mask_shift) - 1)); +} + +/* + * Check events in order. + * + */ +static void memcg_check_events(struct mem_cgroup *mem, struct page *page) +{ + /* threshold event is triggered in finer grain than soft limit */ + if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { + mem_cgroup_threshold(mem); + if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) + mem_cgroup_update_tree(mem, page); + } +} + static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return container_of(cgroup_subsys_state(cont, @@ -758,7 +805,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) task_unlock(task); if (!curr) return 0; - if (curr->use_hierarchy) + /* + * We should check use_hierarchy of "mem" not "curr". Because checking + * use_hierarchy of "curr" here make this function true if hierarchy is + * enabled in "curr" and "curr" is a child of "mem" in *cgroup* + * hierarchy(even if use_hierarchy is disabled in "mem"). + */ + if (mem->use_hierarchy) ret = css_is_ancestor(&curr->css, &mem->css); else ret = (curr == mem); @@ -988,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) } /** - * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. + * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 
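memcg_check_events() above relies on the power-of-two masks noted near the THRESHOLDS_EVENTS_THRESH / SOFTLIMIT_EVENTS_THRESH definitions: the per-cpu event counter is bumped on every pagein/pageout, and `!(val & ((1 << shift) - 1))` is true once every 2^shift events, so thresholds fire roughly once per 128 events and the softlimit tree is updated once per 1024. A self-contained check of that arithmetic (userspace sketch, not the kernel code path):

#include <assert.h>
#include <stdio.h>

/* Mirrors the patch's event masks: fire once per 128 / 1024 events. */
#define THRESHOLDS_EVENTS_THRESH 7
#define SOFTLIMIT_EVENTS_THRESH  10

static int event_check(long long val, int shift)
{
	return !(val & ((1LL << shift) - 1));
}

int main(void)
{
	long long val;
	int thresh_hits = 0, softlimit_hits = 0;

	for (val = 1; val <= 4096; val++) {
		if (event_check(val, THRESHOLDS_EVENTS_THRESH))
			thresh_hits++;
		if (event_check(val, SOFTLIMIT_EVENTS_THRESH))
			softlimit_hits++;
	}
	assert(thresh_hits == 4096 / 128);	/* 32 */
	assert(softlimit_hits == 4096 / 1024);	/* 4 */
	printf("threshold events: %d, softlimit events: %d\n",
	       thresh_hits, softlimit_hits);
	return 0;
}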
* @memcg: The memory cgroup that went over limit * @p: Task that is going to be killed * @@ -1007,7 +1060,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) static char memcg_name[PATH_MAX]; int ret; - if (!memcg) + if (!memcg || !p) return; @@ -1137,6 +1190,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) { loop++; + if (loop >= 1) + drain_all_stock_async(); if (loop >= 2) { /* * If we have not been able to reclaim @@ -1160,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, } } } - if (!mem_cgroup_local_usage(&victim->stat)) { + if (!mem_cgroup_local_usage(victim)) { /* this cgroup's local usage == 0 */ css_put(&victim->css); continue; @@ -1191,90 +1246,284 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return total; } -bool mem_cgroup_oom_called(struct task_struct *task) +static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) { - bool ret = false; - struct mem_cgroup *mem; - struct mm_struct *mm; + int *val = (int *)data; + int x; + /* + * Logically, we can stop scanning immediately when we find + * a memcg is already locked. But condidering unlock ops and + * creation/removal of memcg, scan-all is simple operation. + */ + x = atomic_inc_return(&mem->oom_lock); + *val = max(x, *val); + return 0; +} +/* + * Check OOM-Killer is already running under our hierarchy. + * If someone is running, return false. + */ +static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) +{ + int lock_count = 0; - rcu_read_lock(); - mm = task->mm; - if (!mm) - mm = &init_mm; - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) - ret = true; - rcu_read_unlock(); - return ret; + mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); + + if (lock_count == 1) + return true; + return false; } -static int record_last_oom_cb(struct mem_cgroup *mem, void *data) +static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) { - mem->last_oom_jiffies = jiffies; + /* + * When a new child is created while the hierarchy is under oom, + * mem_cgroup_oom_lock() may not be called. We have to use + * atomic_add_unless() here. + */ + atomic_add_unless(&mem->oom_lock, -1, 0); return 0; } -static void record_last_oom(struct mem_cgroup *mem) +static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); +} + +static DEFINE_MUTEX(memcg_oom_mutex); +static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); + +/* + * try to call OOM killer. returns false if we should exit memory-reclaim loop. + */ +bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) { - mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); + DEFINE_WAIT(wait); + bool locked; + + /* At first, try to OOM lock hierarchy under mem.*/ + mutex_lock(&memcg_oom_mutex); + locked = mem_cgroup_oom_lock(mem); + /* + * Even if signal_pending(), we can't quit charge() loop without + * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL + * under OOM is always welcomed, use TASK_KILLABLE here. 
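mem_cgroup_oom_lock() above takes the maximum of atomic_inc_return() over every memcg in the hierarchy; the caller "owns" the OOM lock only when that maximum is 1, i.e. no node in the subtree was already locked by another OOM in progress. A toy model of that rule over a flat array of counters (hypothetical names, no real hierarchy walk or atomics):

#include <stdio.h>

#define NR_NODES 4

static int oom_lock[NR_NODES];	/* stands in for per-memcg mem->oom_lock */

/* Lock the whole "hierarchy": succeeds only if nobody held any node. */
static int hierarchy_oom_lock(void)
{
	int i, x, lock_count = 0;

	for (i = 0; i < NR_NODES; i++) {
		x = ++oom_lock[i];	/* atomic_inc_return() in the patch */
		if (x > lock_count)
			lock_count = x;
	}
	return lock_count == 1;
}

static void hierarchy_oom_unlock(void)
{
	int i;

	for (i = 0; i < NR_NODES; i++)
		if (oom_lock[i] > 0)	/* like atomic_add_unless(.., -1, 0) */
			oom_lock[i]--;
}

int main(void)
{
	printf("first locker wins:  %d\n", hierarchy_oom_lock());	/* 1 */
	printf("second locker wins: %d\n", hierarchy_oom_lock());	/* 0 */
	hierarchy_oom_unlock();
	hierarchy_oom_unlock();
	return 0;
}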
+ */ + if (!locked) + prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); + mutex_unlock(&memcg_oom_mutex); + + if (locked) + mem_cgroup_out_of_memory(mem, mask); + else { + schedule(); + finish_wait(&memcg_oom_waitq, &wait); + } + mutex_lock(&memcg_oom_mutex); + mem_cgroup_oom_unlock(mem); + /* + * Here, we use global waitq .....more fine grained waitq ? + * Assume following hierarchy. + * A/ + * 01 + * 02 + * assume OOM happens both in A and 01 at the same time. Tthey are + * mutually exclusive by lock. (kill in 01 helps A.) + * When we use per memcg waitq, we have to wake up waiters on A and 02 + * in addtion to waiters on 01. We use global waitq for avoiding mess. + * It will not be a big problem. + * (And a task may be moved to other groups while it's waiting for OOM.) + */ + wake_up_all(&memcg_oom_waitq); + mutex_unlock(&memcg_oom_mutex); + + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) + return false; + /* Give chance to dying process */ + schedule_timeout(1); + return true; } /* * Currently used to update mapped file statistics, but the routine can be * generalized to update other statistics as well. */ -void mem_cgroup_update_mapped_file_stat(struct page *page, int val) +void mem_cgroup_update_file_mapped(struct page *page, int val) { struct mem_cgroup *mem; - struct mem_cgroup_stat *stat; - struct mem_cgroup_stat_cpu *cpustat; - int cpu; struct page_cgroup *pc; - if (!page_is_file_cache(page)) - return; - pc = lookup_page_cgroup(page); if (unlikely(!pc)) return; lock_page_cgroup(pc); mem = pc->mem_cgroup; - if (!mem) - goto done; - - if (!PageCgroupUsed(pc)) + if (!mem || !PageCgroupUsed(pc)) goto done; /* - * Preemption is already disabled, we don't need get_cpu() + * Preemption is already disabled. We can use __this_cpu_xxx */ - cpu = smp_processor_id(); - stat = &mem->stat; - cpustat = &stat->cpustat[cpu]; + if (val > 0) { + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + SetPageCgroupFileMapped(pc); + } else { + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + ClearPageCgroupFileMapped(pc); + } - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); done: unlock_page_cgroup(pc); } /* + * size of first charge trial. "32" comes from vmscan.c's magic value. + * TODO: maybe necessary to use big numbers in big irons. + */ +#define CHARGE_SIZE (32 * PAGE_SIZE) +struct memcg_stock_pcp { + struct mem_cgroup *cached; /* this never be root cgroup */ + int charge; + struct work_struct work; +}; +static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); +static atomic_t memcg_drain_count; + +/* + * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed + * from local stock and true is returned. If the stock is 0 or charges from a + * cgroup which is not current target, returns false. This stock will be + * refilled. + */ +static bool consume_stock(struct mem_cgroup *mem) +{ + struct memcg_stock_pcp *stock; + bool ret = true; + + stock = &get_cpu_var(memcg_stock); + if (mem == stock->cached && stock->charge) + stock->charge -= PAGE_SIZE; + else /* need to call res_counter_charge */ + ret = false; + put_cpu_var(memcg_stock); + return ret; +} + +/* + * Returns stocks cached in percpu to res_counter and reset cached information. 
+ */ +static void drain_stock(struct memcg_stock_pcp *stock) +{ + struct mem_cgroup *old = stock->cached; + + if (stock->charge) { + res_counter_uncharge(&old->res, stock->charge); + if (do_swap_account) + res_counter_uncharge(&old->memsw, stock->charge); + } + stock->cached = NULL; + stock->charge = 0; +} + +/* + * This must be called under preempt disabled or must be called by + * a thread which is pinned to local cpu. + */ +static void drain_local_stock(struct work_struct *dummy) +{ + struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); + drain_stock(stock); +} + +/* + * Cache charges(val) which is from res_counter, to local per_cpu area. + * This will be consumed by consumt_stock() function, later. + */ +static void refill_stock(struct mem_cgroup *mem, int val) +{ + struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); + + if (stock->cached != mem) { /* reset if necessary */ + drain_stock(stock); + stock->cached = mem; + } + stock->charge += val; + put_cpu_var(memcg_stock); +} + +/* + * Tries to drain stocked charges in other cpus. This function is asynchronous + * and just put a work per cpu for draining localy on each cpu. Caller can + * expects some charges will be back to res_counter later but cannot wait for + * it. + */ +static void drain_all_stock_async(void) +{ + int cpu; + /* This function is for scheduling "drain" in asynchronous way. + * The result of "drain" is not directly handled by callers. Then, + * if someone is calling drain, we don't have to call drain more. + * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if + * there is a race. We just do loose check here. + */ + if (atomic_read(&memcg_drain_count)) + return; + /* Notify other cpus that system-wide "drain" is running */ + atomic_inc(&memcg_drain_count); + get_online_cpus(); + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + schedule_work_on(cpu, &stock->work); + } + put_online_cpus(); + atomic_dec(&memcg_drain_count); + /* We don't wait for flush_work */ +} + +/* This is a synchronous drain interface. */ +static void drain_all_stock_sync(void) +{ + /* called when force_empty is called */ + atomic_inc(&memcg_drain_count); + schedule_on_each_cpu(drain_local_stock); + atomic_dec(&memcg_drain_count); +} + +static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + struct memcg_stock_pcp *stock; + + if (action != CPU_DEAD) + return NOTIFY_OK; + stock = &per_cpu(memcg_stock, cpu); + drain_stock(stock); + return NOTIFY_OK; +} + +/* * Unlike exported interface, "oom" parameter is added. if oom==true, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, - bool oom, struct page *page) + gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) { struct mem_cgroup *mem, *mem_over_limit; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct res_counter *fail_res; + int csize = CHARGE_SIZE; - if (unlikely(test_thread_flag(TIF_MEMDIE))) { - /* Don't account this! */ - *memcg = NULL; - return 0; - } + /* + * Unlike gloval-vm's OOM-kill, we're not in memory shortage + * in system level. So, allow to go ahead dying process in addition to + * MEMDIE process. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE) + || fatal_signal_pending(current))) + goto bypass; /* * We always charge the cgroup the mm_struct belongs to. 
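The stock code above (consume_stock()/refill_stock()/drain_stock()) batches res_counter traffic: try_charge grabs CHARGE_SIZE (32 pages) at once, parks the surplus in a per-cpu stock, and later single-page charges are satisfied from that stock without touching the shared counter. A userspace sketch of the same consume/refill idea, with a plain global counter in place of the res_counter and a single "cpu" for simplicity:

#include <stdio.h>

#define PAGE_SIZE   4096
#define CHARGE_SIZE (32 * PAGE_SIZE)	/* batch size, as in the patch */

static long global_usage;		/* stands in for the res_counter */
static long stock_charge;		/* per-cpu stock (one cpu here) */

/* Take PAGE_SIZE from the local stock if possible. */
static int consume_stock(void)
{
	if (stock_charge >= PAGE_SIZE) {
		stock_charge -= PAGE_SIZE;
		return 1;
	}
	return 0;
}

/* Charge one page; fall back to a batched charge of the global counter. */
static void charge_page(void)
{
	if (consume_stock())
		return;
	global_usage += CHARGE_SIZE;		 /* one "expensive" update */
	stock_charge += CHARGE_SIZE - PAGE_SIZE; /* keep the rest locally */
}

/* Return any cached surplus to the global counter, as drain_stock() does. */
static void drain_stock(void)
{
	global_usage -= stock_charge;
	stock_charge = 0;
}

int main(void)
{
	int i;

	for (i = 0; i < 100; i++)
		charge_page();
	drain_stock();
	printf("usage after 100 pages: %ld bytes (%ld pages)\n",
	       global_usage, global_usage / PAGE_SIZE);
	return 0;
}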
@@ -1293,23 +1542,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, return 0; VM_BUG_ON(css_is_removed(&mem->css)); + if (mem_cgroup_is_root(mem)) + goto done; while (1) { int ret = 0; unsigned long flags = 0; - if (mem_cgroup_is_root(mem)) + if (consume_stock(mem)) goto done; - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); + + ret = res_counter_charge(&mem->res, csize, &fail_res); if (likely(!ret)) { if (!do_swap_account) break; - ret = res_counter_charge(&mem->memsw, PAGE_SIZE, - &fail_res); + ret = res_counter_charge(&mem->memsw, csize, &fail_res); if (likely(!ret)) break; /* mem+swap counter fails */ - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, csize); flags |= MEM_CGROUP_RECLAIM_NOSWAP; mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); @@ -1318,6 +1569,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + /* reduce request size and retry */ + if (csize > PAGE_SIZE) { + csize = PAGE_SIZE; + continue; + } if (!(gfp_mask & __GFP_WAIT)) goto nomem; @@ -1337,27 +1593,94 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (mem_cgroup_check_under_limit(mem_over_limit)) continue; + /* try to avoid oom while someone is moving charge */ + if (mc.moving_task && current != mc.moving_task) { + struct mem_cgroup *from, *to; + bool do_continue = false; + /* + * There is a small race that "from" or "to" can be + * freed by rmdir, so we use css_tryget(). + */ + rcu_read_lock(); + from = mc.from; + to = mc.to; + if (from && css_tryget(&from->css)) { + if (mem_over_limit->use_hierarchy) + do_continue = css_is_ancestor( + &from->css, + &mem_over_limit->css); + else + do_continue = (from == mem_over_limit); + css_put(&from->css); + } + if (!do_continue && to && css_tryget(&to->css)) { + if (mem_over_limit->use_hierarchy) + do_continue = css_is_ancestor( + &to->css, + &mem_over_limit->css); + else + do_continue = (to == mem_over_limit); + css_put(&to->css); + } + rcu_read_unlock(); + if (do_continue) { + DEFINE_WAIT(wait); + prepare_to_wait(&mc.waitq, &wait, + TASK_INTERRUPTIBLE); + /* moving charge context might have finished. */ + if (mc.moving_task) + schedule(); + finish_wait(&mc.waitq, &wait); + continue; + } + } + if (!nr_retries--) { - if (oom) { - mutex_lock(&memcg_tasklist); - mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); - mutex_unlock(&memcg_tasklist); - record_last_oom(mem_over_limit); + if (!oom) + goto nomem; + if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { + nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + continue; } - goto nomem; + /* When we reach here, current task is dying .*/ + css_put(&mem->css); + goto bypass; } } - /* - * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. - * if they exceeds softlimit. - */ - if (mem_cgroup_soft_limit_check(mem)) - mem_cgroup_update_tree(mem, page); + if (csize > PAGE_SIZE) + refill_stock(mem, csize - PAGE_SIZE); done: return 0; nomem: css_put(&mem->css); return -ENOMEM; +bypass: + *memcg = NULL; + return 0; +} + +/* + * Somemtimes we have to undo a charge we got by try_charge(). + * This function is for that and do uncharge, put css's refcnt. + * gotten by try_charge(). 
+ */ +static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, + unsigned long count) +{ + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE * count); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); + VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); + WARN_ON_ONCE(count > INT_MAX); + __css_put(&mem->css, (int)count); + } + /* we don't need css_put for root */ +} + +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +{ + __mem_cgroup_cancel_charge(mem, 1); } /* @@ -1379,25 +1702,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) return container_of(css, struct mem_cgroup, css); } -static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) +struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; struct page_cgroup *pc; unsigned short id; swp_entry_t ent; VM_BUG_ON(!PageLocked(page)); - if (!PageSwapCache(page)) - return NULL; - pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; if (mem && !css_tryget(&mem->css)) mem = NULL; - } else { + } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup(ent); rcu_read_lock(); @@ -1426,12 +1746,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); - } - css_put(&mem->css); + mem_cgroup_cancel_charge(mem); return; } @@ -1461,88 +1776,83 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, mem_cgroup_charge_statistics(mem, pc, true); unlock_page_cgroup(pc); + /* + * "charge_statistics" updated event counter. Then, check it. + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. + */ + memcg_check_events(mem, pc->page); } /** - * mem_cgroup_move_account - move account of the page + * __mem_cgroup_move_account - move account of the page * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. + * @uncharge: whether we should call uncharge and css_put against @from. * * The caller must confirm following. * - page is not on LRU (isolate_page() is useful.) + * - the pc is locked, used, and ->mem_cgroup points to @from. * - * returns 0 at success, - * returns -EBUSY when lock is busy or "pc" is unstable. - * - * This function does "uncharge" from old cgroup but doesn't do "charge" to - * new cgroup. It should be done by a caller. + * This function doesn't do "charge" nor css_get to new cgroup. It should be + * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is + * true, this function does "uncharge" from old cgroup, but it doesn't if + * @uncharge is false, so a caller should do "uncharge". 
*/ -static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to) +static void __mem_cgroup_move_account(struct page_cgroup *pc, + struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { - struct mem_cgroup_per_zone *from_mz, *to_mz; - int nid, zid; - int ret = -EBUSY; - struct page *page; - int cpu; - struct mem_cgroup_stat *stat; - struct mem_cgroup_stat_cpu *cpustat; - VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); - - nid = page_cgroup_nid(pc); - zid = page_cgroup_zid(pc); - from_mz = mem_cgroup_zoneinfo(from, nid, zid); - to_mz = mem_cgroup_zoneinfo(to, nid, zid); - - if (!trylock_page_cgroup(pc)) - return ret; - - if (!PageCgroupUsed(pc)) - goto out; - - if (pc->mem_cgroup != from) - goto out; - - if (!mem_cgroup_is_root(from)) - res_counter_uncharge(&from->res, PAGE_SIZE); - mem_cgroup_charge_statistics(from, pc, false); - - page = pc->page; - if (page_is_file_cache(page) && page_mapped(page)) { - cpu = smp_processor_id(); - /* Update mapped_file data for mem_cgroup "from" */ - stat = &from->stat; - cpustat = &stat->cpustat[cpu]; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, - -1); - - /* Update mapped_file data for mem_cgroup "to" */ - stat = &to->stat; - cpustat = &stat->cpustat[cpu]; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, - 1); + VM_BUG_ON(!PageCgroupLocked(pc)); + VM_BUG_ON(!PageCgroupUsed(pc)); + VM_BUG_ON(pc->mem_cgroup != from); + + if (PageCgroupFileMapped(pc)) { + /* Update mapped_file data for mem_cgroup */ + preempt_disable(); + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + preempt_enable(); } + mem_cgroup_charge_statistics(from, pc, false); + if (uncharge) + /* This is not "cancel", but cancel_charge does all we need. */ + mem_cgroup_cancel_charge(from); - if (do_swap_account && !mem_cgroup_is_root(from)) - res_counter_uncharge(&from->memsw, PAGE_SIZE); - css_put(&from->css); - - css_get(&to->css); + /* caller should have done css_get */ pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, pc, true); - ret = 0; -out: - unlock_page_cgroup(pc); /* * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). But in current implementation, caller of - * this function is just force_empty() and it's garanteed that - * "to" is never removed. So, we don't check rmdir status here. + * this function is just force_empty() and move charge, so it's + * garanteed that "to" is never removed. So, we don't check rmdir + * status here. 
+ */ +} + +/* + * check whether the @pc is valid for moving account and call + * __mem_cgroup_move_account() + */ +static int mem_cgroup_move_account(struct page_cgroup *pc, + struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) +{ + int ret = -EINVAL; + lock_page_cgroup(pc); + if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { + __mem_cgroup_move_account(pc, from, to, uncharge); + ret = 0; + } + unlock_page_cgroup(pc); + /* + * check events */ + memcg_check_events(to, pc->page); + memcg_check_events(from, pc->page); return ret; } @@ -1564,45 +1874,25 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (!pcg) return -EINVAL; + ret = -EBUSY; + if (!get_page_unless_zero(page)) + goto out; + if (isolate_lru_page(page)) + goto put; parent = mem_cgroup_from_cont(pcg); - - - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); if (ret || !parent) - return ret; - - if (!get_page_unless_zero(page)) { - ret = -EBUSY; - goto uncharge; - } - - ret = isolate_lru_page(page); + goto put_back; + ret = mem_cgroup_move_account(pc, child, parent, true); if (ret) - goto cancel; - - ret = mem_cgroup_move_account(pc, child, parent); - + mem_cgroup_cancel_charge(parent); +put_back: putback_lru_page(page); - if (!ret) { - put_page(page); - /* drop extra refcnt by try_charge() */ - css_put(&parent->css); - return 0; - } - -cancel: +put: put_page(page); -uncharge: - /* drop extra refcnt by try_charge() */ - css_put(&parent->css); - /* uncharge if move fails */ - if (!mem_cgroup_is_root(parent)) { - res_counter_uncharge(&parent->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&parent->memsw, PAGE_SIZE); - } +out: return ret; } @@ -1627,7 +1917,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, prefetchw(pc); mem = memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); if (ret || !mem) return ret; @@ -1720,7 +2010,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, /* * While swap-in, try_charge -> commit or cancel, the page is locked. * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is aquired. This refcnt will be cumsumed by + * struct page_cgroup is acquired. This refcnt will be consumed by * "commit()" or removed by "cancel()" */ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, @@ -1737,23 +2027,24 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, goto charge_cur_mm; /* * A racing thread's fault, or swapoff, may have already updated - * the pte, and even removed page from swap cache: return success - * to go on to do_swap_page()'s pte_same() test, which should fail. + * the pte, and even removed page from swap cache: in those cases + * do_swap_page()'s pte_same() test will fail; but there's also a + * KSM case which does need to charge the page. 
*/ if (!PageSwapCache(page)) - return 0; - mem = try_get_mem_cgroup_from_swapcache(page); + goto charge_cur_mm; + mem = try_get_mem_cgroup_from_page(page); if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); /* drop extra refcnt from tryget */ css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true, page); + return __mem_cgroup_try_charge(mm, mask, ptr, true); } static void @@ -1818,14 +2109,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); - } - css_put(&mem->css); + mem_cgroup_cancel_charge(mem); } +static void +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +{ + struct memcg_batch_info *batch = NULL; + bool uncharge_memsw = true; + /* If swapout, usage of swap doesn't decrease */ + if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) + uncharge_memsw = false; + /* + * do_batch > 0 when unmapping pages or inode invalidate/truncate. + * In those cases, all pages freed continously can be expected to be in + * the same cgroup and we have chance to coalesce uncharges. + * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) + * because we want to do uncharge as soon as possible. + */ + if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) + goto direct_uncharge; + + batch = ¤t->memcg_batch; + /* + * In usual, we do css_get() when we remember memcg pointer. + * But in this case, we keep res->usage until end of a series of + * uncharges. Then, it's ok to ignore memcg's refcnt. + */ + if (!batch->memcg) + batch->memcg = mem; + /* + * In typical case, batch->memcg == mem. This means we can + * merge a series of uncharges to an uncharge of res_counter. + * If not, we uncharge res_counter ony by one. 
+ */ + if (batch->memcg != mem) + goto direct_uncharge; + /* remember freed charge and uncharge it later */ + batch->bytes += PAGE_SIZE; + if (uncharge_memsw) + batch->memsw_bytes += PAGE_SIZE; + return; +direct_uncharge: + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (uncharge_memsw) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + return; +} /* * uncharge if !page_mapped(page) @@ -1874,12 +2204,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account && - (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); - } + if (!mem_cgroup_is_root(mem)) + __do_uncharge(mem, ctype); if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) mem_cgroup_swap_statistics(mem, true); mem_cgroup_charge_statistics(mem, pc, false); @@ -1895,8 +2221,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mz = page_cgroup_zoneinfo(pc); unlock_page_cgroup(pc); - if (mem_cgroup_soft_limit_check(mem)) - mem_cgroup_update_tree(mem, page); + memcg_check_events(mem, page); /* at swapout, this memcg will be accessed to record to swap */ if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) css_put(&mem->css); @@ -1925,6 +2250,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } +/* + * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. + * In that cases, pages are freed continuously and we can expect pages + * are in the same memcg. All these calls itself limits the number of + * pages freed at once, then uncharge_start/end() is called properly. + * This may be called prural(2) times in a context, + */ + +void mem_cgroup_uncharge_start(void) +{ + current->memcg_batch.do_batch++; + /* We can do nest. */ + if (current->memcg_batch.do_batch == 1) { + current->memcg_batch.memcg = NULL; + current->memcg_batch.bytes = 0; + current->memcg_batch.memsw_bytes = 0; + } +} + +void mem_cgroup_uncharge_end(void) +{ + struct memcg_batch_info *batch = ¤t->memcg_batch; + + if (!batch->do_batch) + return; + + batch->do_batch--; + if (batch->do_batch) /* If stacked, do nothing. */ + return; + + if (!batch->memcg) + return; + /* + * This "batch->memcg" is valid without any css_get/put etc... + * bacause we hide charges behind us. + */ + if (batch->bytes) + res_counter_uncharge(&batch->memcg->res, batch->bytes); + if (batch->memsw_bytes) + res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); + /* forget this pointer (for sanity check) */ + batch->memcg = NULL; +} + #ifdef CONFIG_SWAP /* * called after __delete_from_swap_cache() and drop "page" account. @@ -1979,6 +2348,64 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) } rcu_read_unlock(); } + +/** + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. + * @entry: swap entry to be moved + * @from: mem_cgroup which the entry is moved from + * @to: mem_cgroup which the entry is moved to + * @need_fixup: whether we should fixup res_counters and refcounts. + * + * It succeeds only when the swap_cgroup's record for this entry is the same + * as the mem_cgroup's id of @from. + * + * Returns 0 on success, -EINVAL on failure. + * + * The caller must have charged to @to, IOW, called res_counter_charge() about + * both res and memsw, and called css_get(). 
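mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end() above apply the same coalescing idea on the uncharge side: a per-task nesting counter turns a burst of per-page uncharges (truncate, unmap) into one res_counter update at the end. A minimal sketch of that batching, assuming perfectly paired start/end calls and using a plain global counter instead of the res_counter:

#include <stdio.h>

/* Per-task batch state, as in struct memcg_batch_info. */
static struct {
	int do_batch;		/* nesting depth */
	long bytes;		/* charges accumulated while batching */
} batch;

static long res_usage = 100 * 4096;	/* stands in for memcg->res usage */

static void uncharge_start(void)
{
	if (batch.do_batch++ == 0)
		batch.bytes = 0;
}

/* One page uncharged: either coalesced or applied immediately. */
static void uncharge_page(void)
{
	if (batch.do_batch)
		batch.bytes += 4096;	/* remember it for later */
	else
		res_usage -= 4096;	/* direct uncharge */
}

static void uncharge_end(void)
{
	if (--batch.do_batch)
		return;			/* still nested: keep batching */
	res_usage -= batch.bytes;	/* one counter update for the lot */
	batch.bytes = 0;
}

int main(void)
{
	int i;

	uncharge_start();
	for (i = 0; i < 50; i++)
		uncharge_page();
	uncharge_end();
	printf("usage now %ld pages\n", res_usage / 4096);	/* 50 */
	return 0;
}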
+ */ +static int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) +{ + unsigned short old_id, new_id; + + old_id = css_id(&from->css); + new_id = css_id(&to->css); + + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { + mem_cgroup_swap_statistics(from, false); + mem_cgroup_swap_statistics(to, true); + /* + * This function is only called from task migration context now. + * It postpones res_counter and refcount handling till the end + * of task migration(mem_cgroup_clear_mc()) for performance + * improvement. But we cannot postpone mem_cgroup_get(to) + * because if the process that has been moved to @to does + * swap-in, the refcount of @to might be decreased to 0. + */ + mem_cgroup_get(to); + if (need_fixup) { + if (!mem_cgroup_is_root(from)) + res_counter_uncharge(&from->memsw, PAGE_SIZE); + mem_cgroup_put(from); + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + if (!mem_cgroup_is_root(to)) + res_counter_uncharge(&to->res, PAGE_SIZE); + css_put(&to->css); + } + return 0; + } + return -EINVAL; +} +#else +static inline int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) +{ + return -EINVAL; +} #endif /* @@ -2003,8 +2430,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) unlock_page_cgroup(pc); if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, - page); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); css_put(&mem->css); } *ptr = mem; @@ -2100,7 +2526,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - int progress; u64 memswlimit; int ret = 0; int children = mem_cgroup_count_children(memcg); @@ -2144,8 +2569,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, - GFP_KERNEL, + mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ @@ -2334,7 +2758,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, pc = list_entry(list->prev, struct page_cgroup, lru); if (busy == pc) { list_move(&pc->lru, list); - busy = 0; + busy = NULL; spin_unlock_irqrestore(&zone->lru_lock, flags); continue; } @@ -2375,7 +2799,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) if (free_all) goto try_to_free; move_account: - while (mem->res.usage > 0) { + do { ret = -EBUSY; if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) goto out; @@ -2384,6 +2808,7 @@ move_account: goto out; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); + drain_all_stock_sync(); ret = 0; for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { @@ -2402,8 +2827,8 @@ move_account: if (ret == -ENOMEM) goto try_to_free; cond_resched(); - } - ret = 0; + /* "ret" should also be checked to ensure all lists are empty. */ + } while (mem->res.usage > 0 || ret); out: css_put(&mem->css); return ret; @@ -2436,10 +2861,7 @@ try_to_free: } lru_add_drain(); /* try move_account...there may be some *locked* pages. 
*/ - if (mem->res.usage) - goto move_account; - ret = 0; - goto out; + goto move_account; } int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) @@ -2466,7 +2888,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, cgroup_lock(); /* - * If parent's use_hiearchy is set, we can't make any modifications + * If parent's use_hierarchy is set, we can't make any modifications * in the child subtrees. If it is unset, then the change can * occur, provided the current cgroup has no children. * @@ -2495,7 +2917,7 @@ static int mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) { struct mem_cgroup_idx_data *d = data; - d->val += mem_cgroup_read_stat(&mem->stat, d->idx); + d->val += mem_cgroup_read_stat(mem, d->idx); return 0; } @@ -2510,39 +2932,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, *val = d.val; } +static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) +{ + u64 idx_val, val; + + if (!mem_cgroup_is_root(mem)) { + if (!swap) + return res_counter_read_u64(&mem->res, RES_USAGE); + else + return res_counter_read_u64(&mem->memsw, RES_USAGE); + } + + mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); + val = idx_val; + mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); + val += idx_val; + + if (swap) { + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_SWAPOUT, &idx_val); + val += idx_val; + } + + return val << PAGE_SHIFT; +} + static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - u64 idx_val, val; + u64 val; int type, name; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - if (name == RES_USAGE && mem_cgroup_is_root(mem)) { - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_CACHE, &idx_val); - val = idx_val; - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_RSS, &idx_val); - val += idx_val; - val <<= PAGE_SHIFT; - } else + if (name == RES_USAGE) + val = mem_cgroup_usage(mem, false); + else val = res_counter_read_u64(&mem->res, name); break; case _MEMSWAP: - if (name == RES_USAGE && mem_cgroup_is_root(mem)) { - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_CACHE, &idx_val); - val = idx_val; - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_RSS, &idx_val); - val += idx_val; - mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_SWAPOUT, &idx_val); - val <<= PAGE_SHIFT; - } else + if (name == RES_USAGE) + val = mem_cgroup_usage(mem, true); + else val = res_counter_read_u64(&mem->memsw, name); break; default: @@ -2655,12 +3088,45 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) return 0; } +static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, + struct cftype *cft) +{ + return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; +} + +#ifdef CONFIG_MMU +static int mem_cgroup_move_charge_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + if (val >= (1 << NR_MOVE_TYPE)) + return -EINVAL; + /* + * We check this value several times in both in can_attach() and + * attach(), so we need cgroup lock to prevent this value from being + * inconsistent. 
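
/*
 * Hedged sketch of using the memory.move_charge_at_immigrate file written
 * above: enable charge moving on the destination group (bit 0 is assumed
 * to be the anonymous-page type, MOVE_CHARGE_TYPE_ANON), then move a task
 * by writing its pid to the group's tasks file, which runs the
 * can_attach()/attach() machinery shown further below.  The mount point
 * and group name are illustrative assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, val, strlen(val)) < 0) {
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(int argc, char **argv)
{
        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        /* enable moving of anonymous-page charges into "dest" */
        if (write_str("/cgroup/memory/dest/memory.move_charge_at_immigrate", "1"))
                perror("move_charge_at_immigrate");
        /* attaching the task now triggers precharge and the charge move */
        if (write_str("/cgroup/memory/dest/tasks", argv[1]))
                perror("tasks");
        return 0;
}
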
+ */ + cgroup_lock(); + mem->move_charge_at_immigrate = val; + cgroup_unlock(); + + return 0; +} +#else +static int mem_cgroup_move_charge_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + return -ENOSYS; +} +#endif + /* For read statistics */ enum { MCS_CACHE, MCS_RSS, - MCS_MAPPED_FILE, + MCS_FILE_MAPPED, MCS_PGPGIN, MCS_PGPGOUT, MCS_SWAP, @@ -2700,18 +3166,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s64 val; /* per cpu stat */ - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); s->stat[MCS_CACHE] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); s->stat[MCS_RSS] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); - s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); + s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); s->stat[MCS_PGPGOUT] += val; if (do_swap_account) { - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); s->stat[MCS_SWAP] += val * PAGE_SIZE; } @@ -2839,12 +3305,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, return 0; } +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) +{ + struct mem_cgroup_threshold_ary *t; + u64 usage; + int i; + + rcu_read_lock(); + if (!swap) + t = rcu_dereference(memcg->thresholds); + else + t = rcu_dereference(memcg->memsw_thresholds); + + if (!t) + goto unlock; + + usage = mem_cgroup_usage(memcg, swap); + + /* + * current_threshold points to threshold just below usage. + * If it's not true, a threshold was crossed after last + * call of __mem_cgroup_threshold(). + */ + i = atomic_read(&t->current_threshold); + + /* + * Iterate backward over array of thresholds starting from + * current_threshold and check if a threshold is crossed. + * If none of thresholds below usage is crossed, we read + * only one element of the array here. + */ + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) + eventfd_signal(t->entries[i].eventfd, 1); + + /* i = current_threshold + 1 */ + i++; + + /* + * Iterate forward over array of thresholds starting from + * current_threshold+1 and check if a threshold is crossed. + * If none of thresholds above usage is crossed, we read + * only one element of the array here. 
+ */ + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) + eventfd_signal(t->entries[i].eventfd, 1); + + /* Update current_threshold */ + atomic_set(&t->current_threshold, i - 1); +unlock: + rcu_read_unlock(); +} + +static void mem_cgroup_threshold(struct mem_cgroup *memcg) +{ + __mem_cgroup_threshold(memcg, false); + if (do_swap_account) + __mem_cgroup_threshold(memcg, true); +} + +static int compare_thresholds(const void *a, const void *b) +{ + const struct mem_cgroup_threshold *_a = a; + const struct mem_cgroup_threshold *_b = b; + + return _a->threshold - _b->threshold; +} + +static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, + struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; + int type = MEMFILE_TYPE(cft->private); + u64 threshold, usage; + int size; + int i, ret; + + ret = res_counter_memparse_write_strategy(args, &threshold); + if (ret) + return ret; + + mutex_lock(&memcg->thresholds_lock); + if (type == _MEM) + thresholds = memcg->thresholds; + else if (type == _MEMSWAP) + thresholds = memcg->memsw_thresholds; + else + BUG(); + + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); + + /* Check if a threshold crossed before adding a new one */ + if (thresholds) + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + if (thresholds) + size = thresholds->size + 1; + else + size = 1; + + /* Allocate memory for new array of thresholds */ + thresholds_new = kmalloc(sizeof(*thresholds_new) + + size * sizeof(struct mem_cgroup_threshold), + GFP_KERNEL); + if (!thresholds_new) { + ret = -ENOMEM; + goto unlock; + } + thresholds_new->size = size; + + /* Copy thresholds (if any) to new array */ + if (thresholds) + memcpy(thresholds_new->entries, thresholds->entries, + thresholds->size * + sizeof(struct mem_cgroup_threshold)); + /* Add new threshold */ + thresholds_new->entries[size - 1].eventfd = eventfd; + thresholds_new->entries[size - 1].threshold = threshold; + + /* Sort thresholds. Registering of new threshold isn't time-critical */ + sort(thresholds_new->entries, size, + sizeof(struct mem_cgroup_threshold), + compare_thresholds, NULL); + + /* Find current threshold */ + atomic_set(&thresholds_new->current_threshold, -1); + for (i = 0; i < size; i++) { + if (thresholds_new->entries[i].threshold < usage) { + /* + * thresholds_new->current_threshold will not be used + * until rcu_assign_pointer(), so it's safe to increment + * it here. 
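
/*
 * Standalone sketch (invented names, no RCU or eventfd plumbing) of the
 * search __mem_cgroup_threshold() performs above: thresholds are kept
 * sorted, current_threshold caches the index of the last entry below the
 * previous usage, and each check walks outward from that index, firing
 * every entry crossed since the last call in either direction.
 */
#include <stdio.h>

struct thresholds {
        int current_threshold;          /* index of last entry below usage */
        int size;
        unsigned long entries[8];       /* sorted ascending */
};

static void check(struct thresholds *t, unsigned long usage)
{
        int i = t->current_threshold;

        /* usage dropped: thresholds above it were crossed downward */
        for (; i >= 0 && t->entries[i] > usage; i--)
                printf("signal eventfd for %lu (downward)\n", t->entries[i]);
        /* usage grew: thresholds at or below it were crossed upward */
        for (i++; i < t->size && t->entries[i] <= usage; i++)
                printf("signal eventfd for %lu (upward)\n", t->entries[i]);

        t->current_threshold = i - 1;
}

int main(void)
{
        struct thresholds t = {
                .current_threshold = -1,
                .size = 3,
                .entries = { 4096, 8192, 16384 },
        };

        check(&t, 10000);       /* fires 4096 and 8192 */
        check(&t, 3000);        /* fires 8192 and 4096 on the way down */
        check(&t, 3500);        /* nothing new crossed */
        return 0;
}
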
+ */ + atomic_inc(&thresholds_new->current_threshold); + } + } + + if (type == _MEM) + rcu_assign_pointer(memcg->thresholds, thresholds_new); + else + rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); + + /* To be sure that nobody uses thresholds before freeing it */ + synchronize_rcu(); + + kfree(thresholds); +unlock: + mutex_unlock(&memcg->thresholds_lock); + + return ret; +} + +static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, + struct eventfd_ctx *eventfd) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; + int type = MEMFILE_TYPE(cft->private); + u64 usage; + int size = 0; + int i, j, ret; + + mutex_lock(&memcg->thresholds_lock); + if (type == _MEM) + thresholds = memcg->thresholds; + else if (type == _MEMSWAP) + thresholds = memcg->memsw_thresholds; + else + BUG(); + + /* + * Something went wrong if we trying to unregister a threshold + * if we don't have thresholds + */ + BUG_ON(!thresholds); + + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); + + /* Check if a threshold crossed before removing */ + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + /* Calculate new number of threshold */ + for (i = 0; i < thresholds->size; i++) { + if (thresholds->entries[i].eventfd != eventfd) + size++; + } + + /* Set thresholds array to NULL if we don't have thresholds */ + if (!size) { + thresholds_new = NULL; + goto assign; + } + + /* Allocate memory for new array of thresholds */ + thresholds_new = kmalloc(sizeof(*thresholds_new) + + size * sizeof(struct mem_cgroup_threshold), + GFP_KERNEL); + if (!thresholds_new) { + ret = -ENOMEM; + goto unlock; + } + thresholds_new->size = size; + + /* Copy thresholds and find current threshold */ + atomic_set(&thresholds_new->current_threshold, -1); + for (i = 0, j = 0; i < thresholds->size; i++) { + if (thresholds->entries[i].eventfd == eventfd) + continue; + + thresholds_new->entries[j] = thresholds->entries[i]; + if (thresholds_new->entries[j].threshold < usage) { + /* + * thresholds_new->current_threshold will not be used + * until rcu_assign_pointer(), so it's safe to increment + * it here. 
+ */ + atomic_inc(&thresholds_new->current_threshold); + } + j++; + } + +assign: + if (type == _MEM) + rcu_assign_pointer(memcg->thresholds, thresholds_new); + else + rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); + + /* To be sure that nobody uses thresholds before freeing it */ + synchronize_rcu(); + + kfree(thresholds); +unlock: + mutex_unlock(&memcg->thresholds_lock); + + return ret; +} static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read_u64 = mem_cgroup_read, + .register_event = mem_cgroup_register_event, + .unregister_event = mem_cgroup_unregister_event, }, { .name = "max_usage_in_bytes", @@ -2888,6 +3591,11 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_swappiness_read, .write_u64 = mem_cgroup_swappiness_write, }, + { + .name = "move_charge_at_immigrate", + .read_u64 = mem_cgroup_move_charge_read, + .write_u64 = mem_cgroup_move_charge_write, + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -2896,6 +3604,8 @@ static struct cftype memsw_cgroup_files[] = { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read_u64 = mem_cgroup_read, + .register_event = mem_cgroup_register_event, + .unregister_event = mem_cgroup_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@ -2970,24 +3680,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) kfree(mem->info.nodeinfo[node]); } -static int mem_cgroup_size(void) -{ - int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); - return sizeof(struct mem_cgroup) + cpustat_size; -} - static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *mem; - int size = mem_cgroup_size(); + int size = sizeof(struct mem_cgroup); + /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) mem = kmalloc(size, GFP_KERNEL); else mem = vmalloc(size); - if (mem) - memset(mem, 0, size); + if (!mem) + return NULL; + + memset(mem, 0, size); + mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!mem->stat) { + if (size < PAGE_SIZE) + kfree(mem); + else + vfree(mem); + mem = NULL; + } return mem; } @@ -3012,7 +3727,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) for_each_node_state(node, N_POSSIBLE) free_mem_cgroup_per_zone_info(mem, node); - if (mem_cgroup_size() < PAGE_SIZE) + free_percpu(mem->stat); + if (sizeof(struct mem_cgroup) < PAGE_SIZE) kfree(mem); else vfree(mem); @@ -3023,9 +3739,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem) atomic_inc(&mem->refcnt); } -static void mem_cgroup_put(struct mem_cgroup *mem) +static void __mem_cgroup_put(struct mem_cgroup *mem, int count) { - if (atomic_dec_and_test(&mem->refcnt)) { + if (atomic_sub_and_test(count, &mem->refcnt)) { struct mem_cgroup *parent = parent_mem_cgroup(mem); __mem_cgroup_free(mem); if (parent) @@ -3033,6 +3749,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem) } } +static void mem_cgroup_put(struct mem_cgroup *mem) +{ + __mem_cgroup_put(mem, 1); +} + /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ @@ -3097,12 +3818,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) /* root ? 
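
/*
 * Hedged userspace sketch of reaching the .register_event hook added to
 * usage_in_bytes above: in cgroup v1 an eventfd is paired with a control
 * file by writing "<event fd> <target fd> <threshold>" to
 * cgroup.event_control, and a read() on the eventfd then blocks until a
 * threshold is crossed.  Mount point and group name are illustrative.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        char cmd[64];
        uint64_t ticks;
        int efd = eventfd(0, 0);
        int ufd = open("/cgroup/memory/test/memory.usage_in_bytes", O_RDONLY);
        int cfd = open("/cgroup/memory/test/cgroup.event_control", O_WRONLY);

        if (efd < 0 || ufd < 0 || cfd < 0) {
                perror("open/eventfd");
                return 1;
        }
        /* notify when usage crosses 32M in either direction */
        snprintf(cmd, sizeof(cmd), "%d %d %d", efd, ufd, 32 * 1024 * 1024);
        if (write(cfd, cmd, strlen(cmd)) < 0) {
                perror("cgroup.event_control");
                return 1;
        }
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("threshold crossed %llu time(s)\n",
                       (unsigned long long)ticks);
        return 0;
}
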
*/ if (cont->parent == NULL) { + int cpu; enable_swap_cgroup(); parent = NULL; root_mem_cgroup = mem; if (mem_cgroup_soft_limit_tree_init()) goto free_out; - + for_each_possible_cpu(cpu) { + struct memcg_stock_pcp *stock = + &per_cpu(memcg_stock, cpu); + INIT_WORK(&stock->work, drain_local_stock); + } + hotcpu_notifier(memcg_stock_cpu_callback, 0); } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; @@ -3128,6 +3855,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (parent) mem->swappiness = get_swappiness(parent); atomic_set(&mem->refcnt, 1); + mem->move_charge_at_immigrate = 0; + mutex_init(&mem->thresholds_lock); return &mem->css; free_out: __mem_cgroup_free(mem); @@ -3164,19 +3893,445 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, return ret; } +#ifdef CONFIG_MMU +/* Handlers for move charge at task migration. */ +#define PRECHARGE_COUNT_AT_ONCE 256 +static int mem_cgroup_do_precharge(unsigned long count) +{ + int ret = 0; + int batch_count = PRECHARGE_COUNT_AT_ONCE; + struct mem_cgroup *mem = mc.to; + + if (mem_cgroup_is_root(mem)) { + mc.precharge += count; + /* we don't need css_get for root */ + return ret; + } + /* try to charge at once */ + if (count > 1) { + struct res_counter *dummy; + /* + * "mem" cannot be under rmdir() because we've already checked + * by cgroup_lock_live_cgroup() that it is not removed and we + * are still under the same cgroup_mutex. So we can postpone + * css_get(). + */ + if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) + goto one_by_one; + if (do_swap_account && res_counter_charge(&mem->memsw, + PAGE_SIZE * count, &dummy)) { + res_counter_uncharge(&mem->res, PAGE_SIZE * count); + goto one_by_one; + } + mc.precharge += count; + VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); + WARN_ON_ONCE(count > INT_MAX); + __css_get(&mem->css, (int)count); + return ret; + } +one_by_one: + /* fall back to one by one charge */ + while (count--) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + if (!batch_count--) { + batch_count = PRECHARGE_COUNT_AT_ONCE; + cond_resched(); + } + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + if (ret || !mem) + /* mem_cgroup_clear_mc() will do uncharge later */ + return -ENOMEM; + mc.precharge++; + } + return ret; +} + +/** + * is_target_pte_for_mc - check a pte whether it is valid for move charge + * @vma: the vma the pte to be checked belongs + * @addr: the address corresponding to the pte to be checked + * @ptent: the pte to be checked + * @target: the pointer the target page or swap ent will be stored(can be NULL) + * + * Returns + * 0(MC_TARGET_NONE): if the pte is not a target for move charge. + * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for + * move charge. if @target is not NULL, the page is stored in target->page + * with extra refcnt got(Callers should handle it). + * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a + * target for charge migration. if @target is not NULL, the entry is stored + * in target->ent. + * + * Called with pte lock held. 
+ */ +union mc_target { + struct page *page; + swp_entry_t ent; +}; + +enum mc_target_type { + MC_TARGET_NONE, /* not used */ + MC_TARGET_PAGE, + MC_TARGET_SWAP, +}; + +static int is_target_pte_for_mc(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, union mc_target *target) +{ + struct page *page = NULL; + struct page_cgroup *pc; + int ret = 0; + swp_entry_t ent = { .val = 0 }; + int usage_count = 0; + bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, + &mc.to->move_charge_at_immigrate); + + if (!pte_present(ptent)) { + /* TODO: handle swap of shmes/tmpfs */ + if (pte_none(ptent) || pte_file(ptent)) + return 0; + else if (is_swap_pte(ptent)) { + ent = pte_to_swp_entry(ptent); + if (!move_anon || non_swap_entry(ent)) + return 0; + usage_count = mem_cgroup_count_swap_user(ent, &page); + } + } else { + page = vm_normal_page(vma, addr, ptent); + if (!page || !page_mapped(page)) + return 0; + /* + * TODO: We don't move charges of file(including shmem/tmpfs) + * pages for now. + */ + if (!move_anon || !PageAnon(page)) + return 0; + if (!get_page_unless_zero(page)) + return 0; + usage_count = page_mapcount(page); + } + if (usage_count > 1) { + /* + * TODO: We don't move charges of shared(used by multiple + * processes) pages for now. + */ + if (page) + put_page(page); + return 0; + } + if (page) { + pc = lookup_page_cgroup(page); + /* + * Do only loose check w/o page_cgroup lock. + * mem_cgroup_move_account() checks the pc is valid or not under + * the lock. + */ + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + ret = MC_TARGET_PAGE; + if (target) + target->page = page; + } + if (!ret || !target) + put_page(page); + } + /* throught */ + if (ent.val && do_swap_account && !ret && + css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { + ret = MC_TARGET_SWAP; + if (target) + target->ent = ent; + } + return ret; +} + +static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->private; + pte_t *pte; + spinlock_t *ptl; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + if (is_target_pte_for_mc(vma, addr, *pte, NULL)) + mc.precharge++; /* increment precharge temporarily */ + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + return 0; +} + +static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) +{ + unsigned long precharge; + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + struct mm_walk mem_cgroup_count_precharge_walk = { + .pmd_entry = mem_cgroup_count_precharge_pte_range, + .mm = mm, + .private = vma, + }; + if (is_vm_hugetlb_page(vma)) + continue; + /* TODO: We don't move charges of shmem/tmpfs pages for now. */ + if (vma->vm_flags & VM_SHARED) + continue; + walk_page_range(vma->vm_start, vma->vm_end, + &mem_cgroup_count_precharge_walk); + } + up_read(&mm->mmap_sem); + + precharge = mc.precharge; + mc.precharge = 0; + + return precharge; +} + +static int mem_cgroup_precharge_mc(struct mm_struct *mm) +{ + return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); +} + +static void mem_cgroup_clear_mc(void) +{ + /* we must uncharge all the leftover precharges from mc.to */ + if (mc.precharge) { + __mem_cgroup_cancel_charge(mc.to, mc.precharge); + mc.precharge = 0; + } + /* + * we didn't uncharge from mc.from at mem_cgroup_move_account(), so + * we must uncharge here. 
+ */ + if (mc.moved_charge) { + __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + mc.moved_charge = 0; + } + /* we must fixup refcnts and charges */ + if (mc.moved_swap) { + WARN_ON_ONCE(mc.moved_swap > INT_MAX); + /* uncharge swap account from the old cgroup */ + if (!mem_cgroup_is_root(mc.from)) + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap); + __mem_cgroup_put(mc.from, mc.moved_swap); + + if (!mem_cgroup_is_root(mc.to)) { + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); + VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); + __css_put(&mc.to->css, mc.moved_swap); + } + /* we've already done mem_cgroup_get(mc.to) */ + + mc.moved_swap = 0; + } + mc.from = NULL; + mc.to = NULL; + mc.moving_task = NULL; + wake_up_all(&mc.waitq); +} + +static int mem_cgroup_can_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ + int ret = 0; + struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); + + if (mem->move_charge_at_immigrate) { + struct mm_struct *mm; + struct mem_cgroup *from = mem_cgroup_from_task(p); + + VM_BUG_ON(from == mem); + + mm = get_task_mm(p); + if (!mm) + return 0; + /* We move charges only when we move a owner of the mm */ + if (mm->owner == p) { + VM_BUG_ON(mc.from); + VM_BUG_ON(mc.to); + VM_BUG_ON(mc.precharge); + VM_BUG_ON(mc.moved_charge); + VM_BUG_ON(mc.moved_swap); + VM_BUG_ON(mc.moving_task); + mc.from = from; + mc.to = mem; + mc.precharge = 0; + mc.moved_charge = 0; + mc.moved_swap = 0; + mc.moving_task = current; + + ret = mem_cgroup_precharge_mc(mm); + if (ret) + mem_cgroup_clear_mc(); + } + mmput(mm); + } + return ret; +} + +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ + mem_cgroup_clear_mc(); +} + +static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + int ret = 0; + struct vm_area_struct *vma = walk->private; + pte_t *pte; + spinlock_t *ptl; + +retry: + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; addr += PAGE_SIZE) { + pte_t ptent = *(pte++); + union mc_target target; + int type; + struct page *page; + struct page_cgroup *pc; + swp_entry_t ent; + + if (!mc.precharge) + break; + + type = is_target_pte_for_mc(vma, addr, ptent, &target); + switch (type) { + case MC_TARGET_PAGE: + page = target.page; + if (isolate_lru_page(page)) + goto put; + pc = lookup_page_cgroup(page); + if (!mem_cgroup_move_account(pc, + mc.from, mc.to, false)) { + mc.precharge--; + /* we uncharge from mc.from later. */ + mc.moved_charge++; + } + putback_lru_page(page); +put: /* is_target_pte_for_mc() gets the page */ + put_page(page); + break; + case MC_TARGET_SWAP: + ent = target.ent; + if (!mem_cgroup_move_swap_account(ent, + mc.from, mc.to, false)) { + mc.precharge--; + /* we fixup refcnts and charges later. */ + mc.moved_swap++; + } + break; + default: + break; + } + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + if (addr != end) { + /* + * We have consumed all precharges we got in can_attach(). + * We try charge one by one, but don't do any additional + * charges to mc.to if we have failed in charge once in attach() + * phase. 
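
/*
 * Standalone sketch (invented names) of the budgeting pattern the move-
 * charge walk uses: reserve the whole estimate up front (can_attach),
 * consume it per item, and when the estimate turns out short, top the
 * budget up one unit at a time instead of failing the whole operation;
 * leftovers are returned at the end, as mem_cgroup_clear_mc() does.
 */
#include <stdbool.h>
#include <stdio.h>

static long pool = 10;          /* stands in for the res_counter limit */
static long precharge;          /* stands in for mc.precharge          */

static bool reserve(long n)
{
        if (pool < n)
                return false;
        pool -= n;
        precharge += n;
        return true;
}

static bool move_one(int item)
{
        if (!precharge && !reserve(1))  /* top up one by one when short */
                return false;
        precharge--;
        printf("moved item %d\n", item);
        return true;
}

int main(void)
{
        int i;

        reserve(8);                     /* up-front estimate */
        for (i = 0; i < 12; i++)
                if (!move_one(i)) {
                        printf("gave up at item %d\n", i);
                        break;
                }
        pool += precharge;              /* return unused precharges */
        precharge = 0;
        return 0;
}
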
+ */ + ret = mem_cgroup_do_precharge(1); + if (!ret) + goto retry; + } + + return ret; +} + +static void mem_cgroup_move_charge(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + lru_add_drain_all(); + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + int ret; + struct mm_walk mem_cgroup_move_charge_walk = { + .pmd_entry = mem_cgroup_move_charge_pte_range, + .mm = mm, + .private = vma, + }; + if (is_vm_hugetlb_page(vma)) + continue; + /* TODO: We don't move charges of shmem/tmpfs pages for now. */ + if (vma->vm_flags & VM_SHARED) + continue; + ret = walk_page_range(vma->vm_start, vma->vm_end, + &mem_cgroup_move_charge_walk); + if (ret) + /* + * means we have consumed all precharges and failed in + * doing additional charge. Just abandon here. + */ + break; + } + up_read(&mm->mmap_sem); +} + static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, struct task_struct *p, bool threadgroup) { - mutex_lock(&memcg_tasklist); - /* - * FIXME: It's better to move charges of this process from old - * memcg to new memcg. But it's just on TODO-List now. - */ - mutex_unlock(&memcg_tasklist); + struct mm_struct *mm; + + if (!mc.to) + /* no need to move charge */ + return; + + mm = get_task_mm(p); + if (mm) { + mem_cgroup_move_charge(mm); + mmput(mm); + } + mem_cgroup_clear_mc(); +} +#else /* !CONFIG_MMU */ +static int mem_cgroup_can_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ + return 0; } +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ +} +static void mem_cgroup_move_task(struct cgroup_subsys *ss, + struct cgroup *cont, + struct cgroup *old_cont, + struct task_struct *p, + bool threadgroup) +{ +} +#endif struct cgroup_subsys mem_cgroup_subsys = { .name = "memory", @@ -3185,6 +4340,8 @@ struct cgroup_subsys mem_cgroup_subsys = { .pre_destroy = mem_cgroup_pre_destroy, .destroy = mem_cgroup_destroy, .populate = mem_cgroup_populate, + .can_attach = mem_cgroup_can_attach, + .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .early_init = 0, .use_id = 1, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dacc64183874..620b0b461593 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -34,12 +34,17 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> +#include <linux/kernel-page-flags.h> #include <linux/sched.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/backing-dev.h> +#include <linux/migrate.h> +#include <linux/page-isolation.h> +#include <linux/suspend.h> +#include <linux/slab.h> #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -48,6 +53,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) + +u32 hwpoison_filter_enable = 0; +u32 hwpoison_filter_dev_major = ~0U; +u32 hwpoison_filter_dev_minor = ~0U; +u64 hwpoison_filter_flags_mask; +u64 hwpoison_filter_flags_value; +EXPORT_SYMBOL_GPL(hwpoison_filter_enable); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); + +static int hwpoison_filter_dev(struct page *p) +{ + struct 
address_space *mapping; + dev_t dev; + + if (hwpoison_filter_dev_major == ~0U && + hwpoison_filter_dev_minor == ~0U) + return 0; + + /* + * page_mapping() does not accept slab page + */ + if (PageSlab(p)) + return -EINVAL; + + mapping = page_mapping(p); + if (mapping == NULL || mapping->host == NULL) + return -EINVAL; + + dev = mapping->host->i_sb->s_dev; + if (hwpoison_filter_dev_major != ~0U && + hwpoison_filter_dev_major != MAJOR(dev)) + return -EINVAL; + if (hwpoison_filter_dev_minor != ~0U && + hwpoison_filter_dev_minor != MINOR(dev)) + return -EINVAL; + + return 0; +} + +static int hwpoison_filter_flags(struct page *p) +{ + if (!hwpoison_filter_flags_mask) + return 0; + + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == + hwpoison_filter_flags_value) + return 0; + else + return -EINVAL; +} + +/* + * This allows stress tests to limit test scope to a collection of tasks + * by putting them under some memcg. This prevents killing unrelated/important + * processes such as /sbin/init. Note that the target task may share clean + * pages with init (eg. libc text), which is harmless. If the target task + * share _dirty_ pages with another task B, the test scheme must make sure B + * is also included in the memcg. At last, due to race conditions this filter + * can only guarantee that the page either belongs to the memcg tasks, or is + * a freed page. + */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +u64 hwpoison_filter_memcg; +EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); +static int hwpoison_filter_task(struct page *p) +{ + struct mem_cgroup *mem; + struct cgroup_subsys_state *css; + unsigned long ino; + + if (!hwpoison_filter_memcg) + return 0; + + mem = try_get_mem_cgroup_from_page(p); + if (!mem) + return -EINVAL; + + css = mem_cgroup_css(mem); + /* root_mem_cgroup has NULL dentries */ + if (!css->cgroup->dentry) + return -EINVAL; + + ino = css->cgroup->dentry->d_inode->i_ino; + css_put(css); + + if (ino != hwpoison_filter_memcg) + return -EINVAL; + + return 0; +} +#else +static int hwpoison_filter_task(struct page *p) { return 0; } +#endif + +int hwpoison_filter(struct page *p) +{ + if (!hwpoison_filter_enable) + return 0; + + if (hwpoison_filter_dev(p)) + return -EINVAL; + + if (hwpoison_filter_flags(p)) + return -EINVAL; + + if (hwpoison_filter_task(p)) + return -EINVAL; + + return 0; +} +#else +int hwpoison_filter(struct page *p) +{ + return 0; +} +#endif + +EXPORT_SYMBOL_GPL(hwpoison_filter); + /* * Send all the processes who have the page mapped an ``action optional'' * signal. @@ -83,6 +211,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, } /* + * When a unknown page type is encountered drain as many buffers as possible + * in the hope to turn the page into a LRU or free page, which we can handle. + */ +void shake_page(struct page *p, int access) +{ + if (!PageSlab(p)) { + lru_add_drain_all(); + if (PageLRU(p)) + return; + drain_all_pages(); + if (PageLRU(p) || is_free_buddy_page(p)) + return; + } + + /* + * Only all shrink_slab here (which would also + * shrink other caches) if access is not potentially fatal. + */ + if (access) { + int nr; + do { + nr = shrink_slab(1000, GFP_KERNEL, 1000); + if (page_count(p) == 0) + break; + } while (nr > 10); + } +} +EXPORT_SYMBOL_GPL(shake_page); + +/* * Kill all processes that have a poisoned page mapped and then isolate * the page. 
* @@ -174,10 +332,9 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, list_for_each_entry_safe (tk, next, to_kill, nd) { if (doit) { /* - * In case something went wrong with munmaping + * In case something went wrong with munmapping * make sure the process doesn't catch the * signal and then access the memory. Just kill it. - * the signal handlers */ if (fail || tk->addr_valid == 0) { printk(KERN_ERR @@ -227,9 +384,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (av == NULL) /* Not actually mapped anymore */ goto out; for_each_process (tsk) { + struct anon_vma_chain *vmac; + if (!task_early_kill(tsk)) continue; - list_for_each_entry (vma, &av->head, anon_vma_node) { + list_for_each_entry(vmac, &av->head, same_anon_vma) { + vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; if (vma->vm_mm == tsk->mm) @@ -314,33 +474,49 @@ static void collect_procs(struct page *page, struct list_head *tokill) */ enum outcome { - FAILED, /* Error handling failed */ + IGNORED, /* Error: cannot be handled */ + FAILED, /* Error: handling failed */ DELAYED, /* Will be handled later */ - IGNORED, /* Error safely ignored */ RECOVERED, /* Successfully recovered */ }; static const char *action_name[] = { + [IGNORED] = "Ignored", [FAILED] = "Failed", [DELAYED] = "Delayed", - [IGNORED] = "Ignored", [RECOVERED] = "Recovered", }; /* - * Error hit kernel page. - * Do nothing, try to be lucky and not touch this instead. For a few cases we - * could be more sophisticated. + * XXX: It is possible that a page is isolated from LRU cache, + * and then kept in swap cache or failed to remove from page cache. + * The page count will stop it from being freed by unpoison. + * Stress tests should be aware of this memory leak problem. */ -static int me_kernel(struct page *p, unsigned long pfn) +static int delete_from_lru_cache(struct page *p) { - return DELAYED; + if (!isolate_lru_page(p)) { + /* + * Clear sensible page flags, so that the buddy system won't + * complain when the page is unpoison-and-freed. + */ + ClearPageActive(p); + ClearPageUnevictable(p); + /* + * drop the page count elevated by isolate_lru_page() + */ + page_cache_release(p); + return 0; + } + return -EIO; } /* - * Already poisoned page. + * Error hit kernel page. + * Do nothing, try to be lucky and not touch this instead. For a few cases we + * could be more sophisticated. */ -static int me_ignore(struct page *p, unsigned long pfn) +static int me_kernel(struct page *p, unsigned long pfn) { return IGNORED; } @@ -355,14 +531,6 @@ static int me_unknown(struct page *p, unsigned long pfn) } /* - * Free memory - */ -static int me_free(struct page *p, unsigned long pfn) -{ - return DELAYED; -} - -/* * Clean (or cleaned) page cache page. */ static int me_pagecache_clean(struct page *p, unsigned long pfn) @@ -371,6 +539,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) int ret = FAILED; struct address_space *mapping; + delete_from_lru_cache(p); + /* * For anonymous pages we're done the only reference left * should be the one m_f() holds. 
@@ -500,14 +670,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) /* Trigger EIO in shmem: */ ClearPageUptodate(p); - return DELAYED; + if (!delete_from_lru_cache(p)) + return DELAYED; + else + return FAILED; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { delete_from_swap_cache(p); - return RECOVERED; + if (!delete_from_lru_cache(p)) + return RECOVERED; + else + return FAILED; } /* @@ -550,7 +726,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define tail (1UL << PG_tail) #define compound (1UL << PG_compound) #define slab (1UL << PG_slab) -#define buddy (1UL << PG_buddy) #define reserved (1UL << PG_reserved) static struct page_state { @@ -559,8 +734,11 @@ static struct page_state { char *msg; int (*action)(struct page *p, unsigned long pfn); } error_states[] = { - { reserved, reserved, "reserved kernel", me_ignore }, - { buddy, buddy, "free kernel", me_free }, + { reserved, reserved, "reserved kernel", me_kernel }, + /* + * free pages are specially detected outside this table: + * PG_buddy pages only make a small fraction of all free pages. + */ /* * Could in theory check if slab page is free or if we can drop @@ -582,14 +760,11 @@ static struct page_state { { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, { unevict, unevict, "unevictable LRU", me_pagecache_clean}, -#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, { mlock, mlock, "mlocked LRU", me_pagecache_clean }, -#endif { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, - { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, /* * Catchall entry: must be at end. @@ -597,20 +772,31 @@ static struct page_state { { 0, 0, "unknown page state", me_unknown }, }; +#undef dirty +#undef sc +#undef unevict +#undef mlock +#undef writeback +#undef lru +#undef swapbacked +#undef head +#undef tail +#undef compound +#undef slab +#undef reserved + static void action_result(unsigned long pfn, char *msg, int result) { - struct page *page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); + struct page *page = pfn_to_page(pfn); printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", pfn, - page && PageDirty(page) ? "dirty " : "", + PageDirty(page) ? "dirty " : "", msg, action_name[result]); } static int page_action(struct page_state *ps, struct page *p, - unsigned long pfn, int ref) + unsigned long pfn) { int result; int count; @@ -618,18 +804,22 @@ static int page_action(struct page_state *ps, struct page *p, result = ps->action(p, pfn); action_result(pfn, ps->msg, result); - count = page_count(p) - 1 - ref; - if (count != 0) + count = page_count(p) - 1; + if (ps->action == me_swapcache_dirty && result == DELAYED) + count--; + if (count != 0) { printk(KERN_ERR "MCE %#lx: %s page still referenced by %d users\n", pfn, ps->msg, count); + result = FAILED; + } /* Could do more checks here if page looks ok */ /* * Could adjust zone counters here to correct for the missing page. */ - return result == RECOVERED ? 0 : -EBUSY; + return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } #define N_UNMAP_TRIES 5 @@ -638,7 +828,7 @@ static int page_action(struct page_state *ps, struct page *p, * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. 
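
/*
 * Standalone sketch of the error_states[] idiom above: a first-match table
 * of (mask, expected-bits, description) entries scanned in order, with a
 * catch-all { 0, 0, ... } terminator that guarantees the scan ends.  The
 * flag values here are invented for illustration.
 */
#include <stdio.h>

#define F_DIRTY (1UL << 0)
#define F_LRU   (1UL << 1)
#define F_SLAB  (1UL << 2)

struct state {
        unsigned long mask;
        unsigned long res;
        const char *msg;
};

static const struct state states[] = {
        { F_SLAB,          F_SLAB,          "slab"      },
        { F_LRU | F_DIRTY, F_LRU | F_DIRTY, "dirty LRU" },
        { F_LRU | F_DIRTY, F_LRU,           "clean LRU" },
        { 0,               0,               "unknown"   },  /* catch-all */
};

static const char *classify(unsigned long flags)
{
        const struct state *s;

        for (s = states; ; s++)
                if ((flags & s->mask) == s->res)
                        return s->msg;
}

int main(void)
{
        printf("%s\n", classify(F_LRU));            /* clean LRU */
        printf("%s\n", classify(F_LRU | F_DIRTY));  /* dirty LRU */
        printf("%s\n", classify(F_DIRTY));          /* unknown   */
        return 0;
}
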
*/ -static void hwpoison_user_mappings(struct page *p, unsigned long pfn, +static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno) { enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; @@ -648,15 +838,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, int i; int kill = 1; - if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) - return; + if (PageReserved(p) || PageSlab(p)) + return SWAP_SUCCESS; /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ if (!page_mapped(p)) - return; + return SWAP_SUCCESS; + + if (PageCompound(p) || PageKsm(p)) + return SWAP_FAIL; if (PageSwapCache(p)) { printk(KERN_ERR @@ -667,6 +860,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, /* * Propagate the dirty bit from PTEs to struct page first, because we * need this to decide if we should kill or just drop the page. + * XXX: the dirty test could be racy: set_page_dirty() may not always + * be called inside page lock (it's recommended but not enforced). */ mapping = page_mapping(p); if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { @@ -718,11 +913,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, */ kill_procs_ao(&tokill, !!PageDirty(p), trapno, ret != SWAP_SUCCESS, pfn); + + return ret; } -int __memory_failure(unsigned long pfn, int trapno, int ref) +int __memory_failure(unsigned long pfn, int trapno, int flags) { - unsigned long lru_flag; struct page_state *ps; struct page *p; int res; @@ -731,13 +927,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) panic("Memory failure from trap %d on page %lx", trapno, pfn); if (!pfn_valid(pfn)) { - action_result(pfn, "memory outside kernel control", IGNORED); - return -EIO; + printk(KERN_ERR + "MCE %#lx: memory outside kernel control\n", + pfn); + return -ENXIO; } p = pfn_to_page(pfn); if (TestSetPageHWPoison(p)) { - action_result(pfn, "already hardware poisoned", IGNORED); + printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; } @@ -754,9 +952,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) * In fact it's dangerous to directly bump up page count from 0, * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. */ - if (!get_page_unless_zero(compound_head(p))) { - action_result(pfn, "free or high order kernel", IGNORED); - return PageBuddy(compound_head(p)) ? 0 : -EBUSY; + if (!(flags & MF_COUNT_INCREASED) && + !get_page_unless_zero(compound_head(p))) { + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy", DELAYED); + return 0; + } else { + action_result(pfn, "high order kernel", IGNORED); + return -EBUSY; + } } /* @@ -768,14 +972,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) * walked by the page reclaim code, however that's not a big loss. */ if (!PageLRU(p)) - lru_add_drain_all(); - lru_flag = p->flags & lru; - if (isolate_lru_page(p)) { + shake_page(p, 0); + if (!PageLRU(p)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy, 2nd try", DELAYED); + return 0; + } action_result(pfn, "non LRU", IGNORED); put_page(p); return -EBUSY; } - page_cache_release(p); /* * Lock the page and wait for writeback to finish. @@ -783,26 +992,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) * and in many cases impossible, so we just avoid it here. 
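
/*
 * Hedged test sketch: __memory_failure() is normally entered from the
 * machine-check code, but for testing it can be driven from userspace with
 * madvise(MADV_HWPOISON) on a page of the calling process (CAP_SYS_ADMIN
 * and CONFIG_MEMORY_FAILURE required).  The fallback constant below is the
 * asm-generic value and is an assumption; it may differ per architecture.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100       /* asm-generic value; assumption */
#endif

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(p, 0xaa, psz);   /* make it a dirty anonymous page */
        if (madvise(p, psz, MADV_HWPOISON)) {
                perror("madvise(MADV_HWPOISON)");
                return 1;
        }
        printf("page poisoned; touching it now raises SIGBUS\n");
        return 0;
}
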
*/ lock_page_nosync(p); + + /* + * unpoison always clear PG_hwpoison inside page lock + */ + if (!PageHWPoison(p)) { + printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + res = 0; + goto out; + } + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(p)) + atomic_long_dec(&mce_bad_pages); + unlock_page(p); + put_page(p); + return 0; + } + wait_on_page_writeback(p); /* * Now take care of user space mappings. + * Abort on fail: __remove_from_page_cache() assumes unmapped page. */ - hwpoison_user_mappings(p, pfn, trapno); + if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { + printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); + res = -EBUSY; + goto out; + } /* * Torn down by someone else? */ - if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { action_result(pfn, "already truncated LRU", IGNORED); - res = 0; + res = -EBUSY; goto out; } res = -EBUSY; for (ps = error_states;; ps++) { - if (((p->flags | lru_flag)& ps->mask) == ps->res) { - res = page_action(ps, p, pfn, ref); + if ((p->flags & ps->mask) == ps->res) { + res = page_action(ps, p, pfn); break; } } @@ -833,3 +1064,235 @@ void memory_failure(unsigned long pfn, int trapno) { __memory_failure(pfn, trapno, 0); } + +/** + * unpoison_memory - Unpoison a previously poisoned page + * @pfn: Page number of the to be unpoisoned page + * + * Software-unpoison a page that has been poisoned by + * memory_failure() earlier. + * + * This is only done on the software-level, so it only works + * for linux injected failures, not real hardware failures + * + * Returns 0 for success, otherwise -errno. + */ +int unpoison_memory(unsigned long pfn) +{ + struct page *page; + struct page *p; + int freeit = 0; + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + page = compound_head(p); + + if (!PageHWPoison(p)) { + pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); + return 0; + } + + if (!get_page_unless_zero(page)) { + if (TestClearPageHWPoison(p)) + atomic_long_dec(&mce_bad_pages); + pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); + return 0; + } + + lock_page_nosync(page); + /* + * This test is racy because PG_hwpoison is set outside of page lock. + * That's acceptable because that won't trigger kernel panic. Instead, + * the PG_hwpoison page will be caught and isolated on the entrance to + * the free buddy page pool. + */ + if (TestClearPageHWPoison(p)) { + pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); + atomic_long_dec(&mce_bad_pages); + freeit = 1; + } + unlock_page(page); + + put_page(page); + if (freeit) + put_page(page); + + return 0; +} +EXPORT_SYMBOL(unpoison_memory); + +static struct page *new_page(struct page *p, unsigned long private, int **x) +{ + int nid = page_to_nid(p); + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); +} + +/* + * Safely get reference count of an arbitrary page. + * Returns 0 for a free page, -EIO for a zero refcount page + * that is not free, and 1 for any other page type. + * For 1 the page is returned with increased page count, otherwise not. + */ +static int get_any_page(struct page *p, unsigned long pfn, int flags) +{ + int ret; + + if (flags & MF_COUNT_INCREASED) + return 1; + + /* + * The lock_system_sleep prevents a race with memory hotplug, + * because the isolation assumes there's only a single user. + * This is a big hammer, a better would be nicer. 
+ */ + lock_system_sleep(); + + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free. + */ + set_migratetype_isolate(p); + if (!get_page_unless_zero(compound_head(p))) { + if (is_free_buddy_page(p)) { + pr_debug("get_any_page: %#lx free buddy page\n", pfn); + /* Set hwpoison bit while page is still isolated */ + SetPageHWPoison(p); + ret = 0; + } else { + pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", + pfn, p->flags); + ret = -EIO; + } + } else { + /* Not a free page */ + ret = 1; + } + unset_migratetype_isolate(p); + unlock_system_sleep(); + return ret; +} + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. + */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + /* + * Page cache page we can handle? + */ + if (!PageLRU(page)) { + /* + * Try to free it. + */ + put_page(page); + shake_page(page, 1); + + /* + * Did it turn free? + */ + ret = get_any_page(page, pfn, 0); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + } + if (!PageLRU(page)) { + pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", + pfn, page->flags); + return -EIO; + } + + lock_page(page); + wait_on_page_writeback(page); + + /* + * Synchronized using the page lock with memory_failure() + */ + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + pr_debug("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + + /* + * Try to invalidate first. This should work for + * non dirty unmapped page cache pages. + */ + ret = invalidate_inode_page(page); + unlock_page(page); + + /* + * Drop count because page migration doesn't like raised + * counts. The page could get re-allocated, but if it becomes + * LRU the isolation will just fail. + * RED-PEN would be better to keep it isolated here, but we + * would need to fix isolation locking first. + */ + put_page(page); + if (ret == 1) { + ret = 0; + pr_debug("soft_offline: %#lx: invalidated\n", pfn); + goto done; + } + + /* + * Simple invalidation didn't work. + * Try to migrate to a new page instead. migrate.c + * handles a large number of cases for us. 
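
/*
 * Hedged sketch: soft_offline_page() can also be exercised from userspace
 * through the madvise soft-offline injector, when the kernel provides it
 * (CAP_SYS_ADMIN needed).  Unlike MADV_HWPOISON, the mapped data is
 * expected to survive because the page is migrated rather than discarded.
 * The fallback constant is the asm-generic value and is an assumption.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101   /* asm-generic value; assumption */
#endif

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(p, 0x55, psz);
        if (madvise(p, psz, MADV_SOFT_OFFLINE)) {
                perror("madvise(MADV_SOFT_OFFLINE)");
                return 1;
        }
        /* the backing page was replaced; the mapping still reads back */
        printf("still readable: 0x%02x\n", (unsigned char)p[0]);
        return 0;
}
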
+ */ + ret = isolate_lru_page(page); + if (!ret) { + LIST_HEAD(pagelist); + + list_add(&page->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + if (ret) { + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + } + } else { + pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pfn, ret, page_count(page), page->flags); + } + if (ret) + return ret; + +done: + atomic_long_add(1, &mce_bad_pages); + SetPageHWPoison(page); + /* keep elevated page count for bad page */ + return ret; +} diff --git a/mm/memory.c b/mm/memory.c index 6ab19dd4a199..833952d8b74d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -56,6 +56,7 @@ #include <linux/kallsyms.h> #include <linux/swapops.h> #include <linux/elf.h> +#include <linux/gfp.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -121,6 +122,77 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); + +#if defined(SPLIT_RSS_COUNTING) + +static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + if (task->rss_stat.count[i]) { + add_mm_counter(mm, i, task->rss_stat.count[i]); + task->rss_stat.count[i] = 0; + } + } + task->rss_stat.events = 0; +} + +static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) +{ + struct task_struct *task = current; + + if (likely(task->mm == mm)) + task->rss_stat.count[member] += val; + else + add_mm_counter(mm, member, val); +} +#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) +#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) + +/* sync counter once per 64 page faults */ +#define TASK_RSS_EVENTS_THRESH (64) +static void check_sync_rss_stat(struct task_struct *task) +{ + if (unlikely(task != current)) + return; + if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) + __sync_task_rss_stat(task, task->mm); +} + +unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + long val = 0; + + /* + * Don't use task->mm here...for avoiding to use task_get_mm().. + * The caller must guarantee task->mm is not invalid. + */ + val = atomic_long_read(&mm->rss_stat.count[member]); + /* + * counter is updated in asynchronous manner and may go to minus. + * But it's never be expected number for users. + */ + if (val < 0) + return 0; + return (unsigned long)val; +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ + __sync_task_rss_stat(task, mm); +} +#else + +#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) +#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) + +static void check_sync_rss_stat(struct task_struct *task) +{ +} + +#endif + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. 
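
/*
 * Standalone sketch of the SPLIT_RSS_COUNTING idea above: each thread
 * batches counter deltas privately and folds them into the shared atomic
 * only every few events, so readers may briefly see a slightly stale value
 * (clamped so it is never reported negative) in exchange for far fewer
 * updates to the shared cache line.  Names and the threshold are
 * illustrative, not the kernel's.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define SYNC_THRESH 64

static atomic_long shared_rss;                  /* like mm->rss_stat   */
static _Thread_local long local_rss;            /* like task->rss_stat */
static _Thread_local int local_events;

static void fast_add(long val)
{
        local_rss += val;
        if (++local_events > SYNC_THRESH) {     /* fold back occasionally */
                atomic_fetch_add(&shared_rss, local_rss);
                local_rss = 0;
                local_events = 0;
        }
}

static long read_counter(void)
{
        long val = atomic_load(&shared_rss);

        return val < 0 ? 0 : val;       /* may lag, never shown negative */
}

static void *worker(void *arg)
{
        int i;

        for (i = 0; i < 1000; i++)
                fast_add(1);
        /* explicit sync on exit, like sync_mm_rss() */
        atomic_fetch_add(&shared_rss, local_rss);
        local_rss = 0;
        return NULL;
}

int main(void)
{
        pthread_t t[4];
        int i;

        for (i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        printf("rss = %ld\n", read_counter());  /* 4000 after all syncs */
        return 0;
}
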
Usually (but @@ -300,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ - anon_vma_unlink(vma); + unlink_anon_vmas(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { @@ -314,7 +386,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; - anon_vma_unlink(vma); + unlink_anon_vmas(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, @@ -376,12 +448,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) return 0; } -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +static inline void init_rss_vec(int *rss) { - if (file_rss) - add_mm_counter(mm, file_rss, file_rss); - if (anon_rss) - add_mm_counter(mm, anon_rss, anon_rss); + memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); +} + +static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) +{ + int i; + + if (current->mm == mm) + sync_mm_rss(current, mm); + for (i = 0; i < NR_MM_COUNTERS; i++) + if (rss[i]) + add_mm_counter(mm, i, rss[i]); } /* @@ -430,12 +510,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); - if (page) { - printk(KERN_ALERT - "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", - page, (void *)page->flags, page_count(page), - page_mapcount(page), page->mapping, page->index); - } + if (page) + dump_page(page); printk(KERN_ALERT "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); @@ -572,7 +648,7 @@ out: * covered by this vma. */ -static inline void +static inline unsigned long copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr, int *rss) @@ -586,7 +662,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!pte_file(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - swap_duplicate(entry); + if (swap_duplicate(entry) < 0) + return entry.val; + /* make sure dst_mm is on swapoff's mmlist. 
*/ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -595,7 +673,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } - if (is_write_migration_entry(entry) && + if (likely(!non_swap_entry(entry))) + rss[MM_SWAPENTS]++; + else if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent @@ -630,11 +710,15 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (page) { get_page(page); page_dup_rmap(page); - rss[PageAnon(page)]++; + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; } out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -645,10 +729,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[NR_MM_COUNTERS]; + swp_entry_t entry = (swp_entry_t){0}; again: - rss[1] = rss[0] = 0; + init_rss_vec(rss); + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; @@ -674,16 +760,25 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, + vma, addr, rss); + if (entry.val) + break; progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(orig_src_pte); - add_mm_rss(dst_mm, rss[0], rss[1]); + add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); + + if (entry.val) { + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) + return -ENOMEM; + progress = 0; + } if (addr != end) goto again; return 0; @@ -803,8 +898,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct mm_struct *mm = tlb->mm; pte_t *pte; spinlock_t *ptl; - int file_rss = 0; - int anon_rss = 0; + int rss[NR_MM_COUNTERS]; + + init_rss_vec(rss); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -850,14 +946,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) - anon_rss--; + rss[MM_ANONPAGES]--; else { if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && likely(!VM_SequentialReadHint(vma))) mark_page_accessed(page); - file_rss--; + rss[MM_FILEPAGES]--; } page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) @@ -874,13 +970,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_file(ptent)) { if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) print_bad_pte(vma, addr, ptent, NULL); - } else if - (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) - print_bad_pte(vma, addr, ptent, NULL); + } else { + swp_entry_t entry = pte_to_swp_entry(ptent); + + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); - add_mm_rss(mm, file_rss, anon_rss); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -943,6 +1044,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, details = NULL; BUG_ON(addr >= end); + mem_cgroup_uncharge_start(); tlb_start_vma(tlb, vma); pgd = 
pgd_offset(vma->vm_mm, addr); do { @@ -955,6 +1057,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, zap_work, details); } while (pgd++, addr = next, (addr != end && *zap_work > 0)); tlb_end_vma(tlb, vma); + mem_cgroup_uncharge_end(); return addr; } @@ -1512,7 +1615,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, file_rss); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -1578,7 +1681,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ entry = pte_mkspecial(pfn_pte(pfn, prot)); set_pte_at(mm, addr, pte, entry); - update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ + update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ retval = 0; out_unlock: @@ -2029,6 +2132,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(old_page); } reuse = reuse_swap_page(old_page); + if (reuse) + /* + * The page is all ours. Move it to our anon_vma so + * the rmap code will not search our parent or siblings. + * Protected against the rmap code by the page lock. + */ + page_move_anon_rmap(old_page, vma, address); unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2101,7 +2211,7 @@ reuse: entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); ret |= VM_FAULT_WRITE; goto unlock; } @@ -2148,11 +2258,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, file_rss); - inc_mm_counter(mm, anon_rss); + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2170,7 +2280,7 @@ gotten: * new page to be mapped directly into the secondary page table. 
*/ set_pte_at_notify(mm, address, page_table, entry); - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); if (old_page) { /* * Only after switching the pte to the new page may @@ -2514,7 +2624,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_HWPOISON; } else { print_bad_pte(vma, address, orig_pte, NULL); - ret = VM_FAULT_OOM; + ret = VM_FAULT_SIGBUS; } goto out; } @@ -2540,6 +2650,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); } else if (PageHWPoison(page)) { + /* + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) + */ ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto out_release; @@ -2548,6 +2662,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + page = ksm_might_need_to_copy(page, vma, address); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; goto out_page; @@ -2579,7 +2699,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2604,7 +2725,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); + update_mmu_cache(vma, address, page_table); unlock: pte_unmap_unlock(page_table, ptl); out: @@ -2663,13 +2784,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); unlock: pte_unmap_unlock(page_table, ptl); return 0; @@ -2817,10 +2938,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, file_rss); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; @@ -2830,7 +2951,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); } else { if (charged) mem_cgroup_uncharge_page(page); @@ -2910,7 +3031,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Page table corrupted: show pte and kill process. 
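/*
 * The mm/memory.c hunks above replace the separate file_rss/anon_rss locals
 * with an rss[] vector indexed by MM_FILEPAGES/MM_ANONPAGES/MM_SWAPENTS and
 * flush it once per page-table walk.  A minimal userspace model of that
 * batching pattern follows; the names (RSS_*, add_rss_vec) are invented for
 * illustration and are not the kernel API.
 */
#include <stdio.h>

enum { RSS_FILEPAGES, RSS_ANONPAGES, RSS_SWAPENTS, NR_RSS_COUNTERS };

/* stands in for the shared per-mm counters that are costly to touch per pte */
static long mm_counters[NR_RSS_COUNTERS];

/* apply a walk's accumulated deltas in one step */
static void add_rss_vec(const long rss[NR_RSS_COUNTERS])
{
	for (int i = 0; i < NR_RSS_COUNTERS; i++)
		if (rss[i])
			mm_counters[i] += rss[i];
}

int main(void)
{
	long rss[NR_RSS_COUNTERS] = { 0 };

	/* pretend the walk copied two anon ptes, one file pte and one swap entry */
	rss[RSS_ANONPAGES] += 2;
	rss[RSS_FILEPAGES] += 1;
	rss[RSS_SWAPENTS]  += 1;

	add_rss_vec(rss);	/* one batched update instead of four */

	printf("anon=%ld file=%ld swap=%ld\n", mm_counters[RSS_ANONPAGES],
	       mm_counters[RSS_FILEPAGES], mm_counters[RSS_SWAPENTS]);
	return 0;
}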
*/ print_bad_pte(vma, address, orig_pte, NULL); - return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; } pgoff = pte_to_pgoff(orig_pte); @@ -2967,7 +3088,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pte); } else { /* * This is needed only for protection faults but the arch code @@ -2998,6 +3119,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGFAULT); + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2047465cd27c..be211a582930 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -27,6 +27,8 @@ #include <linux/page-isolation.h> #include <linux/pfn.h> #include <linux/suspend.h> +#include <linux/mm_inline.h> +#include <linux/firmware-map.h> #include <asm/tlbflush.h> @@ -71,7 +73,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) atomic_inc(&page->_count); } -void put_page_bootmem(struct page *page) +/* reference to __meminit __free_pages_bootmem is valid + * so use __ref to tell modpost not to generate a warning */ +void __ref put_page_bootmem(struct page *page) { int type; @@ -520,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size) BUG_ON(ret); } + /* create new memmap entry */ + firmware_map_add_hotplug(start, start + size, "System RAM"); + goto out; error: @@ -672,15 +679,18 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!ret) { /* Success */ list_add_tail(&page->lru, &source); move_pages--; + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + } else { /* Becasue we don't have big zone->lock. we should check this again here. 
*/ if (page_count(page)) not_managed++; #ifdef CONFIG_DEBUG_VM - printk(KERN_INFO "removing from LRU failed" - " %lx/%d/%lx\n", - pfn, page_count(page), page->flags); + printk(KERN_ALERT "removing pfn %lx from LRU failed\n", + pfn); + dump_page(page); #endif } } @@ -694,7 +704,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (list_empty(&source)) goto out; /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0); + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); out: return ret; @@ -747,7 +757,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -int offline_pages(unsigned long start_pfn, +static int offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { unsigned long pfn, nr_pages, expire; @@ -849,6 +859,10 @@ repeat: setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); + if (!node_present_pages(node)) { + node_clear_state(node, N_HIGH_MEMORY); + kswapd_stop(node); + } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4545d5944243..08f40a2f3fe0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -73,7 +73,6 @@ #include <linux/sched.h> #include <linux/nodemask.h> #include <linux/cpuset.h> -#include <linux/gfp.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> @@ -85,10 +84,12 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> +#include <linux/mm_inline.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -412,17 +413,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page) continue; /* - * The check for PageReserved here is important to avoid - * handling zero pages and other pages that may have been - * marked special by the system. - * - * If the PageReserved would not be checked here then f.e. - * the location of the zero page could have an influence - * on MPOL_MF_STRICT, zero pages would be counted for - * the per node stats, and there would be useless attempts - * to put zero pages on the migration list. + * vm_normal_page() filters out zero pages, but there might + * still be PageReserved pages to skip, perhaps in a VDSO. + * And we cannot move PageKsm pages sensibly or safely yet. */ - if (PageReserved(page)) + if (PageReserved(page) || PageKsm(page)) continue; nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) @@ -567,24 +562,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) } /* Step 2: apply policy to a range and do splits. 
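/*
 * With offline_pages() made static above, offlining is driven through the
 * memory-hotplug sysfs interface.  A sketch of triggering it from userspace;
 * the block index "memory32" is illustrative only, real systems enumerate
 * /sys/devices/system/memory/ first.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/memory/memory32/state";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/* writing "offline" asks the kernel to migrate the block's pages away */
	if (fputs("offline", f) == EOF)
		perror("write");
	if (fclose(f) == EOF)
		perror("close");
	return 0;
}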
*/ -static int mbind_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct mempolicy *new) +static int mbind_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { struct vm_area_struct *next; - int err; + struct vm_area_struct *prev; + struct vm_area_struct *vma; + int err = 0; + pgoff_t pgoff; + unsigned long vmstart; + unsigned long vmend; - err = 0; - for (; vma && vma->vm_start < end; vma = next) { + vma = find_vma_prev(mm, start, &prev); + if (!vma || vma->vm_start > start) + return -EFAULT; + + for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; - if (vma->vm_start < start) - err = split_vma(vma->vm_mm, vma, start, 1); - if (!err && vma->vm_end > end) - err = split_vma(vma->vm_mm, vma, end, 0); - if (!err) - err = policy_vma(vma, new); + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol); + if (prev) { + vma = prev; + next = vma->vm_next; + continue; + } + if (vma->vm_start != vmstart) { + err = split_vma(vma->vm_mm, vma, vmstart, 1); + if (err) + goto out; + } + if (vma->vm_end != vmend) { + err = split_vma(vma->vm_mm, vma, vmend, 0); + if (err) + goto out; + } + err = policy_vma(vma, new_pol); if (err) - break; + goto out; } + + out: return err; } @@ -784,9 +805,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, err = 0; if (nmask) { - task_lock(current); - get_policy_nodemask(pol, nmask); - task_unlock(current); + if (mpol_store_user_nodemask(pol)) { + *nmask = pol->w.user_nodemask; + } else { + task_lock(current); + get_policy_nodemask(pol, nmask); + task_unlock(current); + } } out: @@ -809,6 +834,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { if (!isolate_lru_page(page)) { list_add_tail(&page->lru, pagelist); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } } } @@ -836,7 +863,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) - err = migrate_pages(&pagelist, new_node_page, dest); + err = migrate_pages(&pagelist, new_node_page, dest, 0); return err; } @@ -864,36 +891,36 @@ int do_migrate_pages(struct mm_struct *mm, if (err) goto out; -/* - * Find a 'source' bit set in 'tmp' whose corresponding 'dest' - * bit in 'to' is not also set in 'tmp'. Clear the found 'source' - * bit in 'tmp', and return that <source, dest> pair for migration. - * The pair of nodemasks 'to' and 'from' define the map. - * - * If no pair of bits is found that way, fallback to picking some - * pair of 'source' and 'dest' bits that are not the same. If the - * 'source' and 'dest' bits are the same, this represents a node - * that will be migrating to itself, so no pages need move. - * - * If no bits are left in 'tmp', or if all remaining bits left - * in 'tmp' correspond to the same bit in 'to', return false - * (nothing left to migrate). - * - * This lets us pick a pair of nodes to migrate between, such that - * if possible the dest node is not already occupied by some other - * source node, minimizing the risk of overloading the memory on a - * node that would happen if we migrated incoming memory to a node - * before migrating outgoing memory source that same node. 
- * - * A single scan of tmp is sufficient. As we go, we remember the - * most recent <s, d> pair that moved (s != d). If we find a pair - * that not only moved, but what's better, moved to an empty slot - * (d is not set in tmp), then we break out then, with that pair. - * Otherwise when we finish scannng from_tmp, we at least have the - * most recent <s, d> pair that moved. If we get all the way through - * the scan of tmp without finding any node that moved, much less - * moved to an empty node, then there is nothing left worth migrating. - */ + /* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that <source, dest> pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent <s, d> pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent <s, d> pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ tmp = *from_nodes; while (!nodes_empty(tmp)) { @@ -1049,11 +1076,11 @@ static long do_mbind(unsigned long start, unsigned long len, if (!IS_ERR(vma)) { int nr_failed = 0; - err = mbind_range(vma, start, end, new); + err = mbind_range(mm, start, end, new); if (!list_empty(&pagelist)) nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma); + (unsigned long)vma, 0); if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; @@ -1565,6 +1592,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, } return zl; } + +/* + * init_nodemask_of_mempolicy + * + * If the current task's mempolicy is "default" [NULL], return 'false' + * to indicate default policy. Otherwise, extract the policy nodemask + * for 'bind' or 'interleave' policy into the argument nodemask, or + * initialize the argument nodemask to contain the single node for + * 'preferred' or 'local' policy and return 'true' to indicate presence + * of non-default mempolicy. + * + * We don't bother with reference counting the mempolicy [mpol_get/put] + * because the current task is examining it's own mempolicy and a task's + * mempolicy is only ever changed by the task itself. + * + * N.B., it is the caller's responsibility to free a returned nodemask. 
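/*
 * The do_get_mempolicy() hunk above hands back the user's original nodemask
 * when the policy was installed with MPOL_F_STATIC_NODES or
 * MPOL_F_RELATIVE_NODES.  A small userspace check using the numaif.h wrappers
 * (link with -lnuma); node 0 is chosen arbitrarily, and the fallback define
 * mirrors the kernel ABI value for older headers.
 */
#include <numaif.h>
#include <stdio.h>

#ifndef MPOL_F_STATIC_NODES
#define MPOL_F_STATIC_NODES (1 << 15)
#endif

int main(void)
{
	unsigned long set_mask = 1UL << 0;	/* bind to node 0 */
	unsigned long got_mask = 0;
	int mode = -1;

	if (set_mempolicy(MPOL_BIND | MPOL_F_STATIC_NODES,
			  &set_mask, sizeof(set_mask) * 8))
		perror("set_mempolicy");

	/* should report the mask exactly as it was passed in above */
	if (get_mempolicy(&mode, &got_mask, sizeof(got_mask) * 8, NULL, 0))
		perror("get_mempolicy");

	printf("mode=%#x nodemask=%#lx\n", mode, got_mask);
	return 0;
}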
+ */ +bool init_nodemask_of_mempolicy(nodemask_t *mask) +{ + struct mempolicy *mempolicy; + int nid; + + if (!(mask && current->mempolicy)) + return false; + + mempolicy = current->mempolicy; + switch (mempolicy->mode) { + case MPOL_PREFERRED: + if (mempolicy->flags & MPOL_F_LOCAL) + nid = numa_node_id(); + else + nid = mempolicy->v.preferred_node; + init_nodemask_of_node(mask, nid); + break; + + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + *mask = mempolicy->v.nodes; + break; + + default: + BUG(); + } + + return true; +} #endif /* Allocate a page in interleaved policy. @@ -1685,10 +1759,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(old, &mems); } + rcu_read_unlock(); *new = *old; atomic_set(&new->refcnt, 1); return new; @@ -2122,8 +2198,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) char *rest = nodelist; while (isdigit(*rest)) rest++; - if (!*rest) - err = 0; + if (*rest) + goto out; } break; case MPOL_INTERLEAVE: @@ -2132,7 +2208,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) */ if (!nodelist) nodes = node_states[N_HIGH_MEMORY]; - err = 0; break; case MPOL_LOCAL: /* @@ -2142,11 +2217,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) goto out; mode = MPOL_PREFERRED; break; - - /* - * case MPOL_BIND: mpol_new() enforces non-empty nodemask. - * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. - */ + case MPOL_DEFAULT: + /* + * Insist on a empty nodelist + */ + if (!nodelist) + err = 0; + goto out; + case MPOL_BIND: + /* + * Insist on a nodelist + */ + if (!nodelist) + goto out; } mode_flags = 0; @@ -2160,13 +2243,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else - err = 1; + goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) - err = 1; - else { + goto out; + + { int ret; NODEMASK_SCRATCH(scratch); if (scratch) { @@ -2177,13 +2261,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) ret = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); if (ret) { - err = 1; mpol_put(new); - } else if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; + goto out; } } + err = 0; + if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } out: /* Restore string for error message */ diff --git a/mm/migrate.c b/mm/migrate.c index 7dbcb22316d2..d3f3f7f81075 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -21,6 +21,7 @@ #include <linux/mm_inline.h> #include <linux/nsproxy.h> #include <linux/pagevec.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> @@ -31,6 +32,7 @@ #include <linux/security.h> #include <linux/memcontrol.h> #include <linux/syscalls.h> +#include <linux/gfp.h> #include "internal.h" @@ -78,8 +80,8 @@ int putback_lru_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static void remove_migration_pte(struct vm_area_struct *vma, - struct page *old, struct page *new) +static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, + unsigned long addr, void *old) { struct mm_struct *mm = vma->vm_mm; swp_entry_t entry; @@ -88,40 +90,37 @@ static void remove_migration_pte(struct vm_area_struct *vma, pmd_t 
*pmd; pte_t *ptep, pte; spinlock_t *ptl; - unsigned long addr = page_address_in_vma(new, vma); - - if (addr == -EFAULT) - return; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) - return; + goto out; pud = pud_offset(pgd, addr); if (!pud_present(*pud)) - return; + goto out; pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) - return; + goto out; ptep = pte_offset_map(pmd, addr); if (!is_swap_pte(*ptep)) { pte_unmap(ptep); - return; + goto out; } ptl = pte_lockptr(mm, pmd); spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) - goto out; + goto unlock; entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) - goto out; + if (!is_migration_entry(entry) || + migration_entry_to_page(entry) != old) + goto unlock; get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); @@ -136,59 +135,11 @@ static void remove_migration_pte(struct vm_area_struct *vma, page_add_file_rmap(new); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, pte); - -out: + update_mmu_cache(vma, addr, ptep); +unlock: pte_unmap_unlock(ptep, ptl); -} - -/* - * Note that remove_file_migration_ptes will only work on regular mappings, - * Nonlinear mappings do not use migration entries. - */ -static void remove_file_migration_ptes(struct page *old, struct page *new) -{ - struct vm_area_struct *vma; - struct address_space *mapping = new->mapping; - struct prio_tree_iter iter; - pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - if (!mapping) - return; - - spin_lock(&mapping->i_mmap_lock); - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) - remove_migration_pte(vma, old, new); - - spin_unlock(&mapping->i_mmap_lock); -} - -/* - * Must hold mmap_sem lock on at least one of the vmas containing - * the page so that the anon_vma cannot vanish. - */ -static void remove_anon_migration_ptes(struct page *old, struct page *new) -{ - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - unsigned long mapping; - - mapping = (unsigned long)new->mapping; - - if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) - return; - - /* - * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. 
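/*
 * The migrate.c hunks above fold remove_file_migration_ptes() and
 * remove_anon_migration_ptes() into one rmap_walk() callback.  A toy model of
 * that callback-walk shape, purely illustrative (struct vma, rmap_walk_model
 * and restore_pte are invented names, not kernel code):
 */
#include <stdio.h>

#define SWAP_AGAIN 1	/* keep walking, as the rmap return codes do */

struct vma { const char *name; struct vma *next; };

/* one generic walk visits every mapping of the page and calls back */
static int rmap_walk_model(struct vma *head,
			   int (*visit)(struct vma *, void *), void *arg)
{
	int ret = SWAP_AGAIN;

	for (struct vma *v = head; v && ret == SWAP_AGAIN; v = v->next)
		ret = visit(v, arg);
	return ret;
}

static int restore_pte(struct vma *v, void *old)
{
	/* the same callback serves anon and file mappings alike */
	printf("restore %s in %s\n", (const char *)old, v->name);
	return SWAP_AGAIN;
}

int main(void)
{
	struct vma file = { "file vma", NULL };
	struct vma anon = { "anon vma", &file };

	rmap_walk_model(&anon, restore_pte, "migration entry");
	return 0;
}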
- */ - anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) - remove_migration_pte(vma, old, new); - - spin_unlock(&anon_vma->lock); +out: + return SWAP_AGAIN; } /* @@ -197,10 +148,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new) */ static void remove_migration_ptes(struct page *old, struct page *new) { - if (PageAnon(new)) - remove_anon_migration_ptes(old, new); - else - remove_file_migration_ptes(old, new); + rmap_walk(new, remove_migration_pte, old); } /* @@ -328,8 +276,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ static void migrate_page_copy(struct page *newpage, struct page *page) { - int anon; - copy_highpage(newpage, page); if (PageError(page)) @@ -341,8 +287,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) if (TestClearPageActive(page)) { VM_BUG_ON(PageUnevictable(page)); SetPageActive(newpage); - } else - unevictable_migrate_page(newpage, page); + } else if (TestClearPageUnevictable(page)) + SetPageUnevictable(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -361,12 +307,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page) } mlock_migrate_page(newpage, page); + ksm_migrate_page(newpage, page); ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); - /* page->mapping contains a flag for PageAnon() */ - anon = PageAnon(page); page->mapping = NULL; /* @@ -580,9 +525,9 @@ static int move_to_new_page(struct page *newpage, struct page *page) else rc = fallback_migrate_page(mapping, newpage, page); - if (!rc) { + if (!rc) remove_migration_ptes(page, newpage); - } else + else newpage->mapping = NULL; unlock_page(newpage); @@ -595,7 +540,7 @@ static int move_to_new_page(struct page *newpage, struct page *page) * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force) + struct page *page, int force, int offlining) { int rc = 0; int *result = NULL; @@ -621,6 +566,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, lock_page(page); } + /* + * Only memory hotplug's offline_pages() caller has locked out KSM, + * and can safely migrate a KSM page. The other cases have skipped + * PageKsm along with PageReserved - but it is only now when we have + * the page lock that we can be certain it will not go KSM beneath us + * (KSM will not upgrade a page from PageAnon to PageKsm when it sees + * its pagecount raised, but only here do we take the page lock which + * serializes that). + */ + if (PageKsm(page) && !offlining) { + rc = -EBUSY; + goto unlock; + } + /* charge against new page */ charge = mem_cgroup_prepare_migration(page, &mem); if (charge == -ENOMEM) { @@ -737,7 +696,7 @@ move_newpage: * Return: Number of pages not migrated or error code. 
*/ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private) + new_page_t get_new_page, unsigned long private, int offlining) { int retry = 1; int nr_failed = 0; @@ -746,13 +705,6 @@ int migrate_pages(struct list_head *from, struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; int rc; - unsigned long flags; - - local_irq_save(flags); - list_for_each_entry(page, from, lru) - __inc_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - local_irq_restore(flags); if (!swapwrite) current->flags |= PF_SWAPWRITE; @@ -764,7 +716,7 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2); + page, pass > 2, offlining); switch(rc) { case -ENOMEM: @@ -860,7 +812,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - if (PageReserved(page)) /* Check for zero page */ + /* Use PageReserved to check for zero page */ + if (PageReserved(page) || PageKsm(page)) goto put_and_set; pp->page = page; @@ -878,8 +831,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto put_and_set; err = isolate_lru_page(page); - if (!err) + if (!err) { list_add_tail(&page->lru, &pagelist); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + } put_and_set: /* * Either remove the duplicate refcount from @@ -894,7 +850,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm); + (unsigned long)pm, 0); up_read(&mm->mmap_sem); return err; @@ -953,6 +909,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, goto out_pm; err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_pm; + if (!node_state(node, N_HIGH_MEMORY)) goto out_pm; @@ -1015,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, err = -ENOENT; /* Use PageReserved to check for zero page */ - if (!page || PageReserved(page)) + if (!page || PageReserved(page) || PageKsm(page)) goto set_status; err = page_to_nid(page); @@ -1040,33 +999,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, #define DO_PAGES_STAT_CHUNK_NR 16 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; int chunk_status[DO_PAGES_STAT_CHUNK_NR]; - unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; - int err; - for (i = 0; i < nr_pages; i += chunk_nr) { - if (chunk_nr + i > nr_pages) - chunk_nr = nr_pages - i; + while (nr_pages) { + unsigned long chunk_nr; - err = copy_from_user(chunk_pages, &pages[i], - chunk_nr * sizeof(*chunk_pages)); - if (err) { - err = -EFAULT; - goto out; - } + chunk_nr = nr_pages; + if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) + chunk_nr = DO_PAGES_STAT_CHUNK_NR; + + if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) + break; do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); - err = copy_to_user(&status[i], chunk_status, - chunk_nr * sizeof(*chunk_status)); - if (err) { - err = -EFAULT; - goto out; - } - } - err = 0; + if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) + break; -out: - return err; + pages += chunk_nr; + status += chunk_nr; + nr_pages -= chunk_nr; + } + return nr_pages ? -EFAULT : 0; } /* diff --git a/mm/mincore.c b/mm/mincore.c index 8cb508f84ea4..f77433c20279 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -7,13 +7,14 @@ /* * The mincore() system call. 
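/*
 * do_pages_stat() above now copies the request in chunks of 16 pointers; from
 * userspace that path is reached with move_pages() and a NULL node array.
 * Sketch using the numaif.h wrapper (link with -lnuma):
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *buf = NULL;
	int status = -1;

	if (posix_memalign(&buf, page_size, page_size))
		return 1;
	memset(buf, 0, page_size);		/* fault the page in first */

	void *pages[1] = { buf };
	/* nodes == NULL: only report which node each page sits on */
	if (move_pages(0 /* current process */, 1, pages, NULL, &status, 0))
		perror("move_pages");
	else
		printf("page %p is on node %d\n", buf, status);

	free(buf);
	return 0;
}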
*/ -#include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/syscalls.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/hugetlb.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag if (!vma || addr < vma->vm_start) return -ENOMEM; +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) { + struct hstate *h; + unsigned long nr_huge; + unsigned char present; + + i = 0; + nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); + h = hstate_vma(vma); + nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) + - (addr >> huge_page_shift(h)) + 1; + nr_huge = min(nr_huge, + (vma->vm_end - addr) >> huge_page_shift(h)); + while (1) { + /* hugepage always in RAM for now, + * but generally it needs to be check */ + ptep = huge_pte_offset(current->mm, + addr & huge_page_mask(h)); + present = !!(ptep && + !huge_pte_none(huge_ptep_get(ptep))); + while (1) { + vec[i++] = present; + addr += PAGE_SIZE; + /* reach buffer limit */ + if (i == nr) + return nr; + /* check hugepage border */ + if (!((addr & ~huge_page_mask(h)) + >> PAGE_SHIFT)) + break; + } + } + return nr; + } +#endif + /* * Calculate how many pages there are left in the last level of the * PTE array for our address. diff --git a/mm/mlock.c b/mm/mlock.c index bd6f0e466f6c..8f4e2dfceec1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -25,7 +25,7 @@ int can_do_mlock(void) { if (capable(CAP_IPC_LOCK)) return 1; - if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) + if (rlimit(RLIMIT_MEMLOCK) != 0) return 1; return 0; } @@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page) } } -/* - * called from munlock()/munmap() path with page supposedly on the LRU. +/** + * munlock_vma_page - munlock a vma page + * @page - page to be unlocked * - * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked - * [in try_to_munlock()] and then attempt to isolate the page. We must - * isolate the page to keep others from messing with its unevictable - * and mlocked state while trying to munlock. However, we pre-clear the - * mlocked state anyway as we might lose the isolation race and we might - * not get another chance to clear PageMlocked. If we successfully - * isolate the page and try_to_munlock() detects other VM_LOCKED vmas - * mapping the page, it will restore the PageMlocked state, unless the page - * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), - * perhaps redundantly. - * If we lose the isolation race, and the page is mapped by other VM_LOCKED - * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() - * either of which will restore the PageMlocked state by calling - * mlock_vma_page() above, if it can grab the vma's mmap sem. + * called from munlock()/munmap() path with page supposedly on the LRU. + * When we munlock a page, because the vma where we found the page is being + * munlock()ed or munmap()ed, we want to check whether other vmas hold the + * page locked so that we can leave it on the unevictable lru list and not + * bother vmscan with it. However, to walk the page's rmap list in + * try_to_munlock() we must isolate the page from the LRU. If some other + * task has removed the page from the LRU, we won't be able to do that. + * So we clear the PageMlocked as we might not get another chance. 
If we + * can't isolate the page, we leave it for putback_lru_page() and vmscan + * [page_referenced()/try_to_unmap()] to deal with. */ -static void munlock_vma_page(struct page *page) +void munlock_vma_page(struct page *page) { BUG_ON(!PageLocked(page)); @@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page) /* * did try_to_unlock() succeed or punt? */ - if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) + if (ret != SWAP_MLOCK) count_vm_event(UNEVICTABLE_PGMUNLOCKED); putback_lru_page(page); } else { /* - * We lost the race. let try_to_unmap() deal - * with it. At least we get the page state and - * mlock stats right. However, page is still on - * the noreclaim list. We'll fix that up when - * the page is eventually freed or we scan the - * noreclaim list. + * Some other task has removed the page from the LRU. + * putback_lru_page() will take care of removing the + * page from the unevictable list, if necessary. + * vmscan [page_referenced()] will move the page back + * to the unevictable list if some other vma has it + * mlocked. */ if (PageUnevictable(page)) count_vm_event(UNEVICTABLE_PGSTRANDED); @@ -490,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) locked = len >> PAGE_SHIFT; locked += current->mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; /* check against resource limits */ @@ -553,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) down_write(¤t->mm->mmap_sem); - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; @@ -587,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user) int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); if (lock_limit == RLIM_INFINITY) allowed = 1; lock_limit >>= PAGE_SHIFT; @@ -621,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, down_write(&mm->mmap_sem); - lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT; vm = mm->total_vm + pgsz; if (lim < vm) goto out; - lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT; vm = mm->locked_vm + pgsz; if (lim < vm) goto out; diff --git a/mm/mmap.c b/mm/mmap.c index 73f5e4b64010..75557c639ad4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -20,7 +20,6 @@ #include <linux/fs.h> #include <linux/personality.h> #include <linux/security.h> -#include <linux/ima.h> #include <linux/hugetlb.h> #include <linux/profile.h> #include <linux/module.h> @@ -266,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + (mm->end_data - mm->start_data) > rlim) goto out; @@ -438,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, { __vma_link_list(mm, vma, prev, rb_parent); __vma_link_rb(mm, vma, rb_link, rb_parent); - __anon_vma_link(vma); } static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, @@ -500,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, * are necessary. The "insert" vma (if any) is to be inserted * before we drop the necessary locks. 
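/*
 * Several mlock.c and mmap.c sites above switch from open-coding
 * current->signal->rlim[...].rlim_cur to the rlimit() helper.  The limit they
 * read is the ordinary RLIMIT_MEMLOCK visible from userspace:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;
	size_t len = 4096;
	void *buf = malloc(len);

	if (!buf || getrlimit(RLIMIT_MEMLOCK, &rl))
		return 1;
	printf("RLIMIT_MEMLOCK cur=%llu bytes\n",
	       (unsigned long long)rl.rlim_cur);

	memset(buf, 0, len);
	/* for an unprivileged caller this fails once locked_vm would pass the limit */
	if (mlock(buf, len))
		perror("mlock");
	else
		munlock(buf, len);

	free(buf);
	return 0;
}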
*/ -void vma_adjust(struct vm_area_struct *vma, unsigned long start, +int vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) { struct mm_struct *mm = vma->vm_mm; @@ -543,6 +541,26 @@ again: remove_next = 1 + (end > next->vm_end); } } + /* + * When changing only vma->vm_end, we don't really need anon_vma lock. + */ + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) + anon_vma = vma->anon_vma; + if (anon_vma) { + /* + * Easily overlooked: when mprotect shifts the boundary, + * make sure the expanding vma has anon_vma set if the + * shrinking vma had, to cover any anon pages imported. + */ + if (importer && !importer->anon_vma) { + /* Block reverse map lookups until things are set up. */ + if (anon_vma_clone(importer, vma)) { + return -ENOMEM; + } + importer->anon_vma = anon_vma; + } + } + if (file) { mapping = file->f_mapping; if (!(vma->vm_flags & VM_NONLINEAR)) @@ -568,25 +586,6 @@ again: remove_next = 1 + (end > next->vm_end); } } - /* - * When changing only vma->vm_end, we don't really need - * anon_vma lock. - */ - if (vma->anon_vma && (insert || importer || start != vma->vm_start)) - anon_vma = vma->anon_vma; - if (anon_vma) { - spin_lock(&anon_vma->lock); - /* - * Easily overlooked: when mprotect shifts the boundary, - * make sure the expanding vma has anon_vma set if the - * shrinking vma had, to cover any anon pages imported. - */ - if (importer && !importer->anon_vma) { - importer->anon_vma = anon_vma; - __anon_vma_link(importer); - } - } - if (root) { flush_dcache_mmap_lock(mapping); vma_prio_tree_remove(vma, root); @@ -617,8 +616,6 @@ again: remove_next = 1 + (end > next->vm_end); __vma_unlink(mm, next, vma); if (file) __remove_shared_vm_struct(next, file, mapping); - if (next->anon_vma) - __anon_vma_merge(vma, next); } else if (insert) { /* * split_vma has split insert from vma, and needs @@ -628,8 +625,6 @@ again: remove_next = 1 + (end > next->vm_end); __insert_vm_struct(mm, insert); } - if (anon_vma) - spin_unlock(&anon_vma->lock); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -639,6 +634,8 @@ again: remove_next = 1 + (end > next->vm_end); if (next->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(mm); } + if (next->anon_vma) + anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); @@ -654,6 +651,8 @@ again: remove_next = 1 + (end > next->vm_end); } validate_mm(mm); + + return 0; } /* @@ -760,6 +759,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; + int err; /* * We later require that vma->vm_flags == vm_flags, @@ -793,11 +793,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma)) { /* cases 1, 6 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL); } else /* cases 2, 5, 7 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, end, prev->vm_pgoff, NULL); + if (err) + return NULL; return prev; } @@ -809,11 +811,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen)) { if (prev && addr < prev->vm_end) /* case 4 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); else /* cases 3, 8 */ - vma_adjust(area, addr, next->vm_end, + err = vma_adjust(area, addr, next->vm_end, 
next->vm_pgoff - pglen, NULL); + if (err) + return NULL; return area; } @@ -932,13 +936,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); - error = arch_mmap_check(addr, len, flags); - if (error) - return error; - /* Careful about overflows.. */ len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) + if (!len) return -ENOMEM; /* offset overflow? */ @@ -949,24 +949,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; - if (flags & MAP_HUGETLB) { - struct user_struct *user = NULL; - if (file) - return -EINVAL; - - /* - * VM_NORESERVE is used because the reservations will be - * taken when vm_ops->mmap() is called - * A dummy user value is used because we are not locking - * memory so no accounting is necessary - */ - len = ALIGN(len, huge_page_size(&default_hstate)); - file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE); - if (IS_ERR(file)) - return PTR_ERR(file); - } - /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ @@ -990,7 +972,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long locked, lock_limit; locked = len >> PAGE_SHIFT; locked += mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -1061,14 +1043,75 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, error = security_file_mmap(file, reqprot, prot, flags, addr, 0); if (error) return error; - error = ima_file_mmap(file, prot); - if (error) - return error; return mmap_region(file, addr, len, flags, vm_flags, pgoff); } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + if (unlikely(flags & MAP_HUGETLB)) + return -EINVAL; + file = fget(fd); + if (!file) + goto out; + } else if (flags & MAP_HUGETLB) { + struct user_struct *user = NULL; + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + * A dummy user value is used because we are not locking + * memory so no accounting is necessary + */ + len = ALIGN(len, huge_page_size(&default_hstate)); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE); + if (IS_ERR(file)) + return PTR_ERR(file); + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + +#ifdef __ARCH_WANT_SYS_OLD_MMAP +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) +{ + struct mmap_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} +#endif /* __ARCH_WANT_SYS_OLD_MMAP */ + /* * Some shared mappigns 
will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot @@ -1191,6 +1234,7 @@ munmap_back: vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; + INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { error = -EINVAL; @@ -1224,8 +1268,20 @@ munmap_back: goto free_vma; } - if (vma_wants_writenotify(vma)) + if (vma_wants_writenotify(vma)) { + pgprot_t pprot = vma->vm_page_prot; + + /* Can vma->vm_page_prot have changed?? + * + * Answer: Yes, drivers may have changed it in their + * f_op->mmap method. + * + * Ensures that vmas marked as uncached stay that way. + */ vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); + if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + } vma_link(mm, vma, prev, rb_link, rb_parent); file = vma->vm_file; @@ -1239,13 +1295,8 @@ out: mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - /* - * makes pages present; downgrades, drops, reacquires mmap_sem - */ - long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); - if (nr_pages < 0) - return nr_pages; /* vma gone! */ - mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; + if (!mlock_vma_pages_range(vma, addr, addr + len)) + mm->locked_vm += (len >> PAGE_SHIFT); } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); return addr; @@ -1459,6 +1510,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long error = arch_mmap_check(addr, len, flags); + if (error) + return error; + + /* Careful about overflows.. */ + if (len > TASK_SIZE) + return -ENOMEM; + get_area = current->mm->get_unmapped_area; if (file && file->f_op && file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; @@ -1565,7 +1624,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns return -ENOMEM; /* Stack limit test */ - if (size > rlim[RLIMIT_STACK].rlim_cur) + if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) return -ENOMEM; /* mlock limit tests */ @@ -1573,7 +1632,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns unsigned long locked; unsigned long limit; locked = mm->locked_vm + grow; - limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); + limit >>= PAGE_SHIFT; if (locked > limit && !capable(CAP_IPC_LOCK)) return -ENOMEM; } @@ -1720,8 +1780,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) { - if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) - return NULL; /* vma gone! */ + mlock_vma_pages_range(prev, addr, prev->vm_end); } return prev; } @@ -1749,8 +1808,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) if (expand_stack(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) { - if (mlock_vma_pages_range(vma, addr, start) < 0) - return NULL; /* vma gone! */ + mlock_vma_pages_range(vma, addr, start); } return vma; } @@ -1829,29 +1887,29 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Split a vma into two pieces at address 'addr', a new vma is allocated - * either for the first part or the tail. 
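/*
 * The sys_mmap_pgoff() wrapper added above takes over the MAP_HUGETLB handling
 * that used to sit in do_mmap_pgoff().  From userspace that is simply an
 * anonymous mmap() with MAP_HUGETLB; this sketch assumes 2MB default huge
 * pages have been reserved and uses the x86 flag value as a fallback define.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000	/* x86 ABI value; older libc headers lack it */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;	/* one default-sized huge page */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");	/* e.g. no huge pages reserved */
		return 1;
	}
	memset(p, 0, len);
	munmap(p, len);
	return 0;
}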
+ * __split_vma() bypasses sysctl_max_map_count checking. We use this on the + * munmap path where it doesn't make sense to fail. */ -int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, +static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) { struct mempolicy *pol; struct vm_area_struct *new; + int err = -ENOMEM; if (is_vm_hugetlb_page(vma) && (addr & ~(huge_page_mask(hstate_vma(vma))))) return -EINVAL; - if (mm->map_count >= sysctl_max_map_count) - return -ENOMEM; - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) - return -ENOMEM; + goto out_err; /* most fields are the same, copy all, and then fixup */ *new = *vma; + INIT_LIST_HEAD(&new->anon_vma_chain); + if (new_below) new->vm_end = addr; else { @@ -1861,11 +1919,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new); - return PTR_ERR(pol); + err = PTR_ERR(pol); + goto out_free_vma; } vma_set_policy(new, pol); + if (anon_vma_clone(new, vma)) + goto out_free_mpol; + if (new->vm_file) { get_file(new->vm_file); if (vma->vm_flags & VM_EXECUTABLE) @@ -1876,12 +1937,41 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_ops->open(new); if (new_below) - vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), new); else - vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - return 0; + /* Success. */ + if (!err) + return 0; + + /* Clean everything up if vma_adjust failed. */ + new->vm_ops->close(new); + if (new->vm_file) { + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + fput(new->vm_file); + } + out_free_mpol: + mpol_put(pol); + out_free_vma: + kmem_cache_free(vm_area_cachep, new); + out_err: + return err; +} + +/* + * Split a vma into two pieces at address 'addr', a new vma is allocated + * either for the first part or the tail. + */ +int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) +{ + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + return __split_vma(mm, vma, addr, new_below); } /* Munmap is split into 2 main parts -- this part which finds @@ -1919,7 +2009,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) * places tmp vma above, and higher split_vma places tmp vma below. */ if (start > vma->vm_start) { - int error = split_vma(mm, vma, start, 0); + int error; + + /* + * Make sure that map_count on return from munmap() will + * not exceed its limit; but let map_count go just above + * its limit temporarily, to help free resources as expected. + */ + if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + error = __split_vma(mm, vma, start, 0); if (error) return error; prev = vma; @@ -1928,7 +2028,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* Does it split the last one? 
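/*
 * __split_vma() above lets the munmap path split a vma without the
 * sysctl_max_map_count check that split_vma() applies, since map_count may go
 * just above the limit temporarily.  The situation is easy to provoke from
 * userspace: unmapping the middle of a mapping forces two splits.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long ps = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 3 * ps, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* drop only the middle page: one vma becomes two */
	if (munmap(p + ps, ps))
		perror("munmap");

	munmap(p, ps);
	munmap(p + 2 * ps, ps);
	return 0;
}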
*/ last = find_vma(mm, end); if (last && end > last->vm_start) { - int error = split_vma(mm, last, end, 1); + int error = __split_vma(mm, last, end, 1); if (error) return error; } @@ -2003,20 +2103,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (!len) return addr; - if ((addr + len) > TASK_SIZE || (addr + len) < addr) - return -EINVAL; - - if (is_hugepage_only_range(mm, addr, len)) - return -EINVAL; - error = security_file_mmap(NULL, 0, 0, 0, addr, 1); if (error) return error; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - error = arch_mmap_check(addr, len, flags); - if (error) + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (error & ~PAGE_MASK) return error; /* @@ -2026,7 +2120,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) unsigned long locked, lock_limit; locked = len >> PAGE_SHIFT; locked += mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -2074,6 +2168,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) return -ENOMEM; } + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -2210,10 +2305,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma) { *new_vma = *vma; pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new_vma); - return NULL; - } + if (IS_ERR(pol)) + goto out_free_vma; + INIT_LIST_HEAD(&new_vma->anon_vma_chain); + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; vma_set_policy(new_vma, pol); new_vma->vm_start = addr; new_vma->vm_end = addr + len; @@ -2229,6 +2325,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } } return new_vma; + + out_free_mempol: + mpol_put(pol); + out_free_vma: + kmem_cache_free(vm_area_cachep, new_vma); + return NULL; } /* @@ -2240,7 +2342,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) unsigned long cur = mm->total_vm; /* pages */ unsigned long lim; - lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; if (cur + npages > lim) return 0; @@ -2306,6 +2408,7 @@ int install_special_mapping(struct mm_struct *mm, if (unlikely(vma == NULL)) return -ENOMEM; + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -2406,6 +2509,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; + struct anon_vma_chain *avc; int ret = -EINTR; BUG_ON(down_read_trylock(&mm->mmap_sem)); @@ -2423,7 +2527,8 @@ int mm_take_all_locks(struct mm_struct *mm) if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) - vm_lock_anon_vma(mm, vma->anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_lock_anon_vma(mm, avc->anon_vma); } ret = 0; @@ -2478,13 +2583,15 @@ static void vm_unlock_mapping(struct address_space *mapping) void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; + struct anon_vma_chain *avc; BUG_ON(down_read_trylock(&mm->mmap_sem)); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma) - vm_unlock_anon_vma(vma->anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_unlock_anon_vma(avc->anon_vma); if (vma->vm_file && 
vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } diff --git a/mm/mmu_context.c b/mm/mmu_context.c index ded9081f4021..9e82e937000e 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -5,6 +5,7 @@ #include <linux/mm.h> #include <linux/mmu_context.h> +#include <linux/module.h> #include <linux/sched.h> #include <asm/mmu_context.h> @@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); } +EXPORT_SYMBOL_GPL(use_mm); /* * unuse_mm @@ -51,8 +53,10 @@ void unuse_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); + sync_mm_rss(tsk, mm); tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); task_unlock(tsk); } +EXPORT_SYMBOL_GPL(unuse_mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 7e33f2cb3c77..438951d366f2 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -16,6 +16,7 @@ #include <linux/err.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/slab.h> /* * This function can't run concurrently against mmu_notifier_register diff --git a/mm/mprotect.c b/mm/mprotect.c index 8bc969d8112d..2d1bf7cf8851 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -10,7 +10,6 @@ #include <linux/mm.h> #include <linux/hugetlb.h> -#include <linux/slab.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/fs.h> diff --git a/mm/mremap.c b/mm/mremap.c index 97bff2547719..cde56ee51ef7 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -9,7 +9,6 @@ #include <linux/mm.h> #include <linux/hugetlb.h> -#include <linux/slab.h> #include <linux/shm.h> #include <linux/ksm.h> #include <linux/mman.h> @@ -261,6 +260,137 @@ static unsigned long move_vma(struct vm_area_struct *vma, return new_addr; } +static struct vm_area_struct *vma_to_resize(unsigned long addr, + unsigned long old_len, unsigned long new_len, unsigned long *p) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = find_vma(mm, addr); + + if (!vma || vma->vm_start > addr) + goto Efault; + + if (is_vm_hugetlb_page(vma)) + goto Einval; + + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto Efault; + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { + if (new_len > old_len) + goto Efault; + } + + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked, lock_limit; + locked = mm->locked_vm << PAGE_SHIFT; + lock_limit = rlimit(RLIMIT_MEMLOCK); + locked += new_len - old_len; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + goto Eagain; + } + + if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) + goto Enomem; + + if (vma->vm_flags & VM_ACCOUNT) { + unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + goto Efault; + *p = charged; + } + + return vma; + +Efault: /* very odd choice for most of the cases, but... */ + return ERR_PTR(-EFAULT); +Einval: + return ERR_PTR(-EINVAL); +Enomem: + return ERR_PTR(-ENOMEM); +Eagain: + return ERR_PTR(-EAGAIN); +} + +static unsigned long mremap_to(unsigned long addr, + unsigned long old_len, unsigned long new_addr, + unsigned long new_len) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + unsigned long charged = 0; + unsigned long map_flags; + + if (new_addr & ~PAGE_MASK) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. 
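/*
 * mremap_to() above now carries the MREMAP_FIXED checks that used to sit in
 * do_mremap().  A userspace sketch that exercises it: move a mapping to a
 * caller-chosen, non-overlapping address (the destination is reserved with a
 * PROT_NONE mapping first; mremap replaces whatever is there, like MAP_FIXED).
 */
#define _GNU_SOURCE		/* MREMAP_MAYMOVE, MREMAP_FIXED */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long ps = sysconf(_SC_PAGESIZE);
	void *oldp = mmap(NULL, ps, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *dest = mmap(NULL, 2 * ps, PROT_NONE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (oldp == MAP_FAILED || dest == MAP_FAILED)
		return 1;

	void *newp = mremap(oldp, ps, 2 * ps, MREMAP_MAYMOVE | MREMAP_FIXED, dest);
	if (newp == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	munmap(newp, 2 * ps);
	return 0;
}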
+ */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); + if (ret) + goto out; + + ret = do_munmap(mm, new_addr, new_len); + if (ret) + goto out; + + if (old_len >= new_len) { + ret = do_munmap(mm, addr+new_len, old_len - new_len); + if (ret && old_len != new_len) + goto out; + old_len = new_len; + } + + vma = vma_to_resize(addr, old_len, new_len, &charged); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } + + map_flags = MAP_FIXED; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + + ((addr - vma->vm_start) >> PAGE_SHIFT), + map_flags); + if (ret & ~PAGE_MASK) + goto out1; + + ret = move_vma(vma, addr, old_len, new_len, new_addr); + if (!(ret & ~PAGE_MASK)) + goto out; +out1: + vm_unacct_memory(charged); + +out: + return ret; +} + +static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) +{ + unsigned long end = vma->vm_end + delta; + if (end < vma->vm_end) /* overflow */ + return 0; + if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ + return 0; + if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, + 0, MAP_FIXED) & ~PAGE_MASK) + return 0; + return 1; +} + /* * Expand (or shrink) an existing mapping, potentially moving it at the * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) @@ -294,32 +424,10 @@ unsigned long do_mremap(unsigned long addr, if (!new_len) goto out; - /* new_addr is only valid if MREMAP_FIXED is specified */ if (flags & MREMAP_FIXED) { - if (new_addr & ~PAGE_MASK) - goto out; - if (!(flags & MREMAP_MAYMOVE)) - goto out; - - if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) - goto out; - - /* Check if the location we're moving into overlaps the - * old location at all, and fail if it does. - */ - if ((new_addr <= addr) && (new_addr+new_len) > addr) - goto out; - - if ((addr <= new_addr) && (addr+old_len) > new_addr) - goto out; - - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; - - ret = do_munmap(mm, new_addr, new_len); - if (ret) - goto out; + if (flags & MREMAP_MAYMOVE) + ret = mremap_to(addr, old_len, new_addr, new_len); + goto out; } /* @@ -332,64 +440,30 @@ unsigned long do_mremap(unsigned long addr, if (ret && old_len != new_len) goto out; ret = addr; - if (!(flags & MREMAP_FIXED) || (new_addr == addr)) - goto out; - old_len = new_len; + goto out; } /* - * Ok, we need to grow.. or relocate. + * Ok, we need to grow.. 
*/ - ret = -EFAULT; - vma = find_vma(mm, addr); - if (!vma || vma->vm_start > addr) - goto out; - if (is_vm_hugetlb_page(vma)) { - ret = -EINVAL; - goto out; - } - /* We can't remap across vm area boundaries */ - if (old_len > vma->vm_end - addr) - goto out; - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { - if (new_len > old_len) - goto out; - } - if (vma->vm_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = mm->locked_vm << PAGE_SHIFT; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - locked += new_len - old_len; - ret = -EAGAIN; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - goto out; - } - if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { - ret = -ENOMEM; + vma = vma_to_resize(addr, old_len, new_len, &charged); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto out; } - if (vma->vm_flags & VM_ACCOUNT) { - charged = (new_len - old_len) >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) - goto out_nc; - } - /* old_len exactly to the end of the area.. - * And we're not relocating the area. */ - if (old_len == vma->vm_end - addr && - !((flags & MREMAP_FIXED) && (addr != new_addr)) && - (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { - unsigned long max_addr = TASK_SIZE; - if (vma->vm_next) - max_addr = vma->vm_next->vm_start; + if (old_len == vma->vm_end - addr) { /* can we just expand the current mapping? */ - if (max_addr - addr >= new_len) { + if (vma_expandable(vma, new_len - old_len)) { int pages = (new_len - old_len) >> PAGE_SHIFT; - vma_adjust(vma, vma->vm_start, - addr + new_len, vma->vm_pgoff, NULL); + if (vma_adjust(vma, vma->vm_start, addr + new_len, + vma->vm_pgoff, NULL)) { + ret = -ENOMEM; + goto out; + } mm->total_vm += pages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); @@ -409,28 +483,27 @@ unsigned long do_mremap(unsigned long addr, */ ret = -ENOMEM; if (flags & MREMAP_MAYMOVE) { - if (!(flags & MREMAP_FIXED)) { - unsigned long map_flags = 0; - if (vma->vm_flags & VM_MAYSHARE) - map_flags |= MAP_SHARED; - - new_addr = get_unmapped_area(vma->vm_file, 0, new_len, - vma->vm_pgoff, map_flags); - if (new_addr & ~PAGE_MASK) { - ret = new_addr; - goto out; - } - - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; + unsigned long map_flags = 0; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, + vma->vm_pgoff + + ((addr - vma->vm_start) >> PAGE_SHIFT), + map_flags); + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; } + + ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); + if (ret) + goto out; ret = move_vma(vma, addr, old_len, new_len, new_addr); } out: if (ret & ~PAGE_MASK) vm_unacct_memory(charged); -out_nc: return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index 9876fa0c3ad3..63fa17d121f0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -162,7 +162,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } if (vmas) vmas[i] = vma; - start += PAGE_SIZE; + start = (start + PAGE_SIZE) & PAGE_MASK; } return i; @@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. 
*/ + flush_icache_range(mm->brk, brk); return mm->brk = brk; } @@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to) static void __put_nommu_region(struct vm_region *region) __releases(nommu_region_sem) { - kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); + kenter("%p{%d}", region, region->vm_usage); BUG_ON(!nommu_region_tree.rb_node); - if (atomic_dec_and_test(®ion->vm_usage)) { + if (--region->vm_usage == 0) { if (region->vm_top > region->vm_start) delete_nommu_region(region); up_write(&nommu_region_sem); @@ -1039,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) if (ret != -ENOSYS) return ret; - /* getting an ENOSYS error indicates that direct mmap isn't - * possible (as opposed to tried but failed) so we'll fall - * through to making a private copy of the data and mapping - * that if we can */ + /* getting -ENOSYS indicates that direct mmap isn't possible (as + * opposed to tried but failed) so we can only give a suitable error as + * it's not possible to make a private copy if MAP_SHARED was given */ return -ENODEV; } @@ -1143,9 +1143,6 @@ static int do_mmap_private(struct vm_area_struct *vma, if (ret < rlen) memset(base + ret, 0, rlen - ret); - } else { - /* if it's an anonymous mapping, then just clear it */ - memset(base, 0, rlen); } return 0; @@ -1207,11 +1204,11 @@ unsigned long do_mmap_pgoff(struct file *file, if (!vma) goto error_getting_vma; - atomic_set(®ion->vm_usage, 1); + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_node); + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; @@ -1274,7 +1271,7 @@ unsigned long do_mmap_pgoff(struct file *file, } /* we've found a region we can share */ - atomic_inc(&pregion->vm_usage); + pregion->vm_usage++; vma->vm_region = pregion; start = pregion->vm_start; start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; @@ -1291,7 +1288,7 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = NULL; vma->vm_start = 0; vma->vm_end = 0; - atomic_dec(&pregion->vm_usage); + pregion->vm_usage--; pregion = NULL; goto error_just_free; } @@ -1343,6 +1340,11 @@ unsigned long do_mmap_pgoff(struct file *file, goto error_just_free; add_nommu_region(region); + /* clear anonymous mappings that don't ask for uninitialized data */ + if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) + memset((void *)region->vm_start, 0, + region->vm_end - region->vm_start); + /* okay... 
we have a mapping; now we have to register it */ result = vma->vm_start; @@ -1351,10 +1353,14 @@ unsigned long do_mmap_pgoff(struct file *file, share: add_vma_to_mm(current->mm, vma); - up_write(&nommu_region_sem); + /* we flush the region from the icache only when the first executable + * mapping of it is made */ + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { + flush_icache_range(region->vm_start, region->vm_end); + region->vm_icache_flushed = true; + } - if (prot & PROT_EXEC) - flush_icache_range(result, result + len); + up_write(&nommu_region_sem); kleave(" = %lx", result); return result; @@ -1396,6 +1402,55 @@ error_getting_region: } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + +#ifdef __ARCH_WANT_SYS_OLD_MMAP +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) +{ + struct mmap_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} +#endif /* __ARCH_WANT_SYS_OLD_MMAP */ + /* * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. @@ -1409,10 +1464,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, kenter(""); - /* we're only permitted to split anonymous regions that have a single - * owner */ - if (vma->vm_file || - atomic_read(&vma->vm_region->vm_usage) != 1) + /* we're only permitted to split anonymous regions (these should have + * only a single usage on the region) */ + if (vma->vm_file) return -ENOMEM; if (mm->map_count >= sysctl_max_map_count) @@ -1486,7 +1540,7 @@ static int shrink_vma(struct mm_struct *mm, /* cut the backing region down to size */ region = vma->vm_region; - BUG_ON(atomic_read(®ion->vm_usage) != 1); + BUG_ON(region->vm_usage != 1); down_write(&nommu_region_sem); delete_nommu_region(region); @@ -1730,27 +1784,6 @@ void unmap_mapping_range(struct address_space *mapping, EXPORT_SYMBOL(unmap_mapping_range); /* - * ask for an unmapped area at which to create a mapping on a file - */ -unsigned long get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); - - get_area = current->mm->get_unmapped_area; - if (file && file->f_op && file->f_op->get_unmapped_area) - get_area = file->f_op->get_unmapped_area; - - if (!get_area) - return -ENOSYS; - - return get_area(file, addr, len, pgoff, flags); -} -EXPORT_SYMBOL(get_unmapped_area); - -/* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. 
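
An illustrative userspace sketch (not part of the diff above) of the MAP_UNINITIALIZED behaviour now handled in do_mmap_pgoff(): on a no-MMU kernel built with CONFIG_MMAP_ALLOW_UNINITIALIZED an anonymous mapping may skip the zeroing added above, while on other configurations the flag is defined as 0 and is simply ignored.

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0	/* ignored unless the kernel enables it */
#endif

int main(void)
{
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED,
		       -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("mapped anonymous page at %p\n", p);
	munmap(p, 4096);
	return 0;
}
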
@@ -1889,9 +1922,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in /* only read or write mappings where it is permitted */ if (write && vma->vm_flags & VM_MAYWRITE) - len -= copy_to_user((void *) addr, buf, len); + copy_to_user_page(vma, NULL, addr, + (void *) addr, buf, len); else if (!write && vma->vm_flags & VM_MAYREAD) - len -= copy_from_user(buf, (void *) addr, len); + copy_from_user_page(vma, NULL, addr, + buf, (void *) addr, len); else len = 0; } else { @@ -1902,3 +1937,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in mmput(mm); return len; } + +/** + * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode + * @inode: The inode to check + * @size: The current filesize of the inode + * @newsize: The proposed filesize of the inode + * + * Check the shared mappings on an inode on behalf of a shrinking truncate to + * make sure that that any outstanding VMAs aren't broken and then shrink the + * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't + * automatically grant mappings that are too large. + */ +int nommu_shrink_inode_mappings(struct inode *inode, size_t size, + size_t newsize) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + struct vm_region *region; + pgoff_t low, high; + size_t r_size, r_top; + + low = newsize >> PAGE_SHIFT; + high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + down_write(&nommu_region_sem); + + /* search for VMAs that fall within the dead zone */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + low, high) { + /* found one - only interested if it's shared out of the page + * cache */ + if (vma->vm_flags & VM_SHARED) { + up_write(&nommu_region_sem); + return -ETXTBSY; /* not quite true, but near enough */ + } + } + + /* reduce any regions that overlap the dead zone - if in existence, + * these will be pointed to by VMAs that don't overlap the dead zone + * + * we don't check for any regions that start beyond the EOF as there + * shouldn't be any + */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { + if (!(vma->vm_flags & VM_SHARED)) + continue; + + region = vma->vm_region; + r_size = region->vm_top - region->vm_start; + r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; + + if (r_top > newsize) { + region->vm_top -= r_top - newsize; + if (region->vm_end > region->vm_top) + region->vm_end = region->vm_top; + } + } + + up_write(&nommu_region_sem); + return 0; +} diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ea2147dabba6..b68e802a7a7d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -18,6 +18,7 @@ #include <linux/oom.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/swap.h> #include <linux/timex.h> @@ -196,27 +197,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) /* * Determine the type of allocation constraint. 
*/ -static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask) -{ #ifdef CONFIG_NUMA +static enum oom_constraint constrained_alloc(struct zonelist *zonelist, + gfp_t gfp_mask, nodemask_t *nodemask) +{ struct zone *zone; struct zoneref *z; enum zone_type high_zoneidx = gfp_zone(gfp_mask); - nodemask_t nodes = node_states[N_HIGH_MEMORY]; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - if (cpuset_zone_allowed_softwall(zone, gfp_mask)) - node_clear(zone_to_nid(zone), nodes); - else - return CONSTRAINT_CPUSET; + /* + * Reach here only when __GFP_NOFAIL is used. So, we should avoid + * to kill current.We have to random task kill in this case. + * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. + */ + if (gfp_mask & __GFP_THISNODE) + return CONSTRAINT_NONE; - if (!nodes_empty(nodes)) + /* + * The nodemask here is a nodemask passed to alloc_pages(). Now, + * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy + * feature. mempolicy is an only user of nodemask here. + * check mempolicy's nodemask contains all N_HIGH_MEMORY + */ + if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) return CONSTRAINT_MEMORY_POLICY; -#endif + + /* Check this allocation failure is caused by cpuset's wall function */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) + if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) + return CONSTRAINT_CPUSET; return CONSTRAINT_NONE; } +#else +static enum oom_constraint constrained_alloc(struct zonelist *zonelist, + gfp_t gfp_mask, nodemask_t *nodemask) +{ + return CONSTRAINT_NONE; +} +#endif /* * Simple selection loop. We chose the process with the highest @@ -337,6 +357,24 @@ static void dump_tasks(const struct mem_cgroup *mem) } while_each_thread(g, p); } +static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, + struct mem_cgroup *mem) +{ + pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " + "oom_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_adj); + task_lock(current); + cpuset_print_task_mems_allowed(current); + task_unlock(current); + dump_stack(); + mem_cgroup_print_oom_info(mem, p); + show_mem(); + if (sysctl_oom_dump_tasks) + dump_tasks(mem); +} + +#define K(x) ((x) << (PAGE_SHIFT-10)) + /* * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO @@ -350,15 +388,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } + task_lock(p); if (!p->mm) { WARN_ON(1); - printk(KERN_WARNING "tried to kill an mm-less task!\n"); + printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n", + task_pid_nr(p), p->comm); + task_unlock(p); return; } if (verbose) - printk(KERN_ERR "Killed process %d (%s)\n", - task_pid_nr(p), p->comm); + printk(KERN_ERR "Killed process %d (%s) " + "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", + task_pid_nr(p), p->comm, + K(p->mm->total_vm), + K(get_mm_counter(p->mm, MM_ANONPAGES)), + K(get_mm_counter(p->mm, MM_FILEPAGES))); + task_unlock(p); /* * We give our sacrificial lamb high priority and access to @@ -395,20 +441,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, { struct task_struct *c; - if (printk_ratelimit()) { - printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", - current->comm, gfp_mask, order, - current->signal->oom_adj); - task_lock(current); - cpuset_print_task_mems_allowed(current); - 
task_unlock(current); - dump_stack(); - mem_cgroup_print_oom_info(mem, current); - show_mem(); - if (sysctl_oom_dump_tasks) - dump_tasks(mem); - } + if (printk_ratelimit()) + dump_header(p, gfp_mask, order, mem); /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -426,6 +460,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, list_for_each_entry(c, &p->children, sibling) { if (c->mm == p->mm) continue; + if (mem && !task_in_mem_cgroup(c, mem)) + continue; if (!oom_kill_task(c)) return 0; } @@ -438,6 +474,8 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) unsigned long points = 0; struct task_struct *p; + if (sysctl_panic_on_oom == 2) + panic("out of memory(memcg). panic_on_oom is selected.\n"); read_lock(&tasklist_lock); retry: p = select_bad_process(&points, mem); @@ -544,6 +582,7 @@ retry: /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { read_unlock(&tasklist_lock); + dump_header(NULL, gfp_mask, order, NULL); panic("Out of memory and no killable processes...\n"); } @@ -565,13 +604,6 @@ void pagefault_out_of_memory(void) /* Got some memory back in the last second. */ return; - /* - * If this is from memcg, oom-killer is already invoked. - * and not worth to go system-wide-oom. - */ - if (mem_cgroup_oom_called(current)) - goto rest_and_return; - if (sysctl_panic_on_oom) panic("out of memory from page fault. panic_on_oom is selected.\n"); @@ -583,7 +615,6 @@ void pagefault_out_of_memory(void) * Give "p" a good chance of killing itself before we * retry to allocate memory. */ -rest_and_return: if (!test_thread_flag(TIF_MEMDIE)) schedule_timeout_uninterruptible(1); } @@ -599,7 +630,8 @@ rest_and_return: * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) +void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask) { unsigned long freed = 0; enum oom_constraint constraint; @@ -609,14 +641,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) /* Got some memory back in the last second. */ return; - if (sysctl_panic_on_oom == 2) + if (sysctl_panic_on_oom == 2) { + dump_header(NULL, gfp_mask, order, NULL); panic("out of memory. Compulsory panic_on_oom is selected.\n"); + } /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - constraint = constrained_alloc(zonelist, gfp_mask); + constraint = constrained_alloc(zonelist, gfp_mask, nodemask); read_lock(&tasklist_lock); switch (constraint) { @@ -626,8 +660,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) break; case CONSTRAINT_NONE: - if (sysctl_panic_on_oom) + if (sysctl_panic_on_oom) { + dump_header(NULL, gfp_mask, order, NULL); panic("out of memory. 
panic_on_oom is selected\n"); + } /* Fall-through */ case CONSTRAINT_CPUSET: __out_of_memory(gfp_mask, order); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2c5d79236ead..0b19943ecf8b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -821,7 +821,6 @@ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { - struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; struct pagevec pvec; @@ -834,11 +833,6 @@ int write_cache_pages(struct address_space *mapping, int range_whole = 0; long nr_to_write = wbc->nr_to_write; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - pagevec_init(&pvec, 0); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ @@ -957,12 +951,6 @@ continue_unlock: break; } } - - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - break; - } } pagevec_release(&pvec); cond_resched(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bc2ac63f41e..d03c946d5566 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,7 +48,9 @@ #include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> +#include <linux/memory.h> #include <trace/events/kmem.h> +#include <linux/ftrace_event.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -75,6 +77,31 @@ unsigned long totalreserve_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +#ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with pm_mutex held (gfp_allowed_mask also should + * only be modified with pm_mutex held, unless the suspend/hibernate code is + * guaranteed not to run in parallel with that modification). + */ +void set_gfp_allowed_mask(gfp_t mask) +{ + WARN_ON(!mutex_is_locked(&pm_mutex)); + gfp_allowed_mask = mask; +} + +gfp_t clear_gfp_allowed_mask(gfp_t mask) +{ + gfp_t ret = gfp_allowed_mask; + + WARN_ON(!mutex_is_locked(&pm_mutex)); + gfp_allowed_mask &= ~mask; + return ret; +} +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; #endif @@ -262,10 +289,7 @@ static void bad_page(struct page *page) printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - printk(KERN_ALERT - "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", - page, (void *)page->flags, page_count(page), - page_mapcount(page), page->mapping, page->index); + dump_page(page); dump_stack(); out: @@ -486,7 +510,6 @@ static inline void __free_one_page(struct page *page, zone->free_area[order].nr_free++; } -#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* * free_page_mlock() -- clean up attempts to free and mlocked() page. * Page should not be on lru, so no need to fix that up. 
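
A minimal sketch of how the new gfp_allowed_mask helpers are meant to be used by the suspend/hibernate core; only set_gfp_allowed_mask() and clear_gfp_allowed_mask() come from the diff, the caller names here are hypothetical. I/O- and FS-backed allocations are masked off for the window in which devices are suspended, under pm_mutex.

static gfp_t example_saved_gfp_mask;

static void example_pm_restrict_gfp_mask(void)
{
	/* called with pm_mutex held, before devices are suspended */
	example_saved_gfp_mask = clear_gfp_allowed_mask(__GFP_IO | __GFP_FS);
}

static void example_pm_restore_gfp_mask(void)
{
	/* called with pm_mutex held, after devices are resumed */
	set_gfp_allowed_mask(example_saved_gfp_mask);
}
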
@@ -497,9 +520,6 @@ static inline void free_page_mlock(struct page *page) __dec_zone_page_state(page, NR_MLOCK); __count_vm_event(UNEVICTABLE_MLOCKFREED); } -#else -static void free_page_mlock(struct page *page) { } -#endif static inline int free_pages_check(struct page *page) { @@ -533,7 +553,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, int batch_free = 0; spin_lock(&zone->lock); - zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); + zone->all_unreclaimable = 0; zone->pages_scanned = 0; __mod_zone_page_state(zone, NR_FREE_PAGES, count); @@ -559,8 +579,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - __free_one_page(page, zone, 0, migratetype); - trace_mm_page_pcpu_drain(page, 0, migratetype); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ + __free_one_page(page, zone, 0, page_private(page)); + trace_mm_page_pcpu_drain(page, 0, page_private(page)); } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); @@ -570,7 +591,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { spin_lock(&zone->lock); - zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); + zone->all_unreclaimable = 0; zone->pages_scanned = 0; __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); @@ -585,6 +606,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) int bad = 0; int wasMlocked = __TestClearPageMlocked(page); + trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); for (i = 0 ; i < (1 << order) ; ++i) @@ -1011,10 +1033,10 @@ static void drain_pages(unsigned int cpu) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - pset = zone_pcp(zone, cpu); + local_irq_save(flags); + pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - local_irq_save(flags); free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; local_irq_restore(flags); @@ -1075,8 +1097,9 @@ void mark_free_pages(struct zone *zone) /* * Free a 0-order page + * cold == 1 ? 
free a cold page : free a hot page */ -static void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -1084,6 +1107,7 @@ static void free_hot_cold_page(struct page *page, int cold) int migratetype; int wasMlocked = __TestClearPageMlocked(page); + trace_mm_page_free_direct(page, 0); kmemcheck_free_shadow(page, 0); if (PageAnon(page)) @@ -1098,7 +1122,6 @@ static void free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp; migratetype = get_pageblock_migratetype(page); set_page_private(page, migratetype); local_irq_save(flags); @@ -1121,6 +1144,7 @@ static void free_hot_cold_page(struct page *page, int cold) migratetype = MIGRATE_MOVABLE; } + pcp = &this_cpu_ptr(zone->pageset)->pcp; if (cold) list_add_tail(&page->lru, &pcp->lists[migratetype]); else @@ -1133,15 +1157,8 @@ static void free_hot_cold_page(struct page *page, int cold) out: local_irq_restore(flags); - put_cpu(); } -void free_hot_page(struct page *page) -{ - trace_mm_page_free_direct(page, 0); - free_hot_cold_page(page, 0); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1<<order) sub-pages: page[0..n] @@ -1183,17 +1200,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); - int cpu; again: - cpu = get_cpu(); if (likely(order == 0)) { struct per_cpu_pages *pcp; struct list_head *list; - pcp = &zone_pcp(zone, cpu)->pcp; - list = &pcp->lists[migratetype]; local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, @@ -1225,16 +1240,15 @@ again: } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); } __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone); local_irq_restore(flags); - put_cpu(); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1243,7 +1257,6 @@ again: failed: local_irq_restore(flags); - put_cpu(); return NULL; } @@ -1658,12 +1671,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out; - /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) - goto out; - + if (!(gfp_mask & __GFP_NOFAIL)) { + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* + * GFP_THISNODE contains __GFP_NORETRY and we never hit this. + * Sanity check for bare calls of __GFP_THISNODE, not real OOM. + * The caller should handle page allocation failure by itself if + * it specifies __GFP_THISNODE. + * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 
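
For reference, the composite GFP_THISNODE that the comment above relies on is defined in include/linux/gfp.h roughly as reproduced below (not part of this diff); because it always carries __GFP_NORETRY, only a bare __GFP_THISNODE can reach this OOM path.

#ifdef CONFIG_NUMA
#define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
#else
#define GFP_THISNODE	((__force gfp_t)0)
#endif
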
+ */ + if (gfp_mask & __GFP_THISNODE) + goto out; + } /* Exhausted what can be done so it's blamo time */ - out_of_memory(zonelist, gfp_mask, order); + out_of_memory(zonelist, gfp_mask, order, nodemask); out: clear_zonelist_oom(zonelist, gfp_mask); @@ -2005,9 +2028,8 @@ void __pagevec_free(struct pagevec *pvec) void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { - trace_mm_page_free_direct(page, order); if (order == 0) - free_hot_page(page); + free_hot_cold_page(page, 0); else __free_pages_ok(page, order); } @@ -2172,7 +2194,7 @@ void show_free_areas(void) for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, cpu); + pageset = per_cpu_ptr(zone->pageset, cpu); printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", cpu, pageset->pcp.high, @@ -2263,7 +2285,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, - (zone_is_all_unreclaimable(zone) ? "yes" : "no") + (zone->all_unreclaimable ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) @@ -2395,13 +2417,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write, { char saved_string[NUMA_ZONELIST_ORDER_LEN]; int ret; + static DEFINE_MUTEX(zl_order_mutex); + mutex_lock(&zl_order_mutex); if (write) - strncpy(saved_string, (char*)table->data, - NUMA_ZONELIST_ORDER_LEN); + strcpy(saved_string, (char*)table->data); ret = proc_dostring(table, write, buffer, length, ppos); if (ret) - return ret; + goto out; if (write) { int oldval = user_zonelist_order; if (__parse_numa_zonelist_order((char*)table->data)) { @@ -2414,7 +2437,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } else if (oldval != user_zonelist_order) build_all_zonelists(); } - return 0; +out: + mutex_unlock(&zl_order_mutex); + return ret; } @@ -2734,10 +2759,29 @@ static void build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. + */ +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); +static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); + /* return values int ....just for stop_machine() */ static int __build_all_zonelists(void *dummy) { int nid; + int cpu; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -2748,6 +2792,23 @@ static int __build_all_zonelists(void *dummy) build_zonelists(pgdat); build_zonelist_cache(pgdat); } + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. 
the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). + */ + for_each_possible_cpu(cpu) + setup_pageset(&per_cpu(boot_pageset, cpu), 0); + return 0; } @@ -3085,121 +3146,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } - -#ifdef CONFIG_NUMA -/* - * Boot pageset table. One per cpu which is going to be used for all - * zones and all nodes. The parameters will be set in such a way - * that an item put on a list will immediately be handed over to - * the buddy list. This is safe since pageset manipulation is done - * with interrupts disabled. - * - * Some NUMA counter updates may also be caught by the boot pagesets. - * - * The boot_pagesets must be kept even after bootup is complete for - * unused processors and/or zones. They do play a role for bootstrapping - * hotplugged processors. - * - * zoneinfo_show() and maybe other functions do - * not check if the processor is online before following the pageset pointer. - * Other parts of the kernel may not check if the zone is available. - */ -static struct per_cpu_pageset boot_pageset[NR_CPUS]; - /* - * Dynamically allocate memory for the - * per cpu pageset array in struct zone. + * Allocate per cpu pagesets and initialize them. + * Before this call only boot pagesets were available. + * Boot pagesets will no longer be used by this processorr + * after setup_per_cpu_pageset(). */ -static int __cpuinit process_zones(int cpu) +void __init setup_per_cpu_pageset(void) { - struct zone *zone, *dzone; - int node = cpu_to_node(cpu); - - node_set_state(node, N_CPU); /* this node has a cpu */ + struct zone *zone; + int cpu; for_each_populated_zone(zone) { - zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), - GFP_KERNEL, node); - if (!zone_pcp(zone, cpu)) - goto bad; + zone->pageset = alloc_percpu(struct per_cpu_pageset); - setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - if (percpu_pagelist_fraction) - setup_pagelist_highmark(zone_pcp(zone, cpu), - (zone->present_pages / percpu_pagelist_fraction)); - } + setup_pageset(pcp, zone_batchsize(zone)); - return 0; -bad: - for_each_zone(dzone) { - if (!populated_zone(dzone)) - continue; - if (dzone == zone) - break; - kfree(zone_pcp(dzone, cpu)); - zone_pcp(dzone, cpu) = &boot_pageset[cpu]; - } - return -ENOMEM; -} - -static inline void free_zone_pagesets(int cpu) -{ - struct zone *zone; - - for_each_zone(zone) { - struct per_cpu_pageset *pset = zone_pcp(zone, cpu); - - /* Free per_cpu_pageset if it is slab allocated */ - if (pset != &boot_pageset[cpu]) - kfree(pset); - zone_pcp(zone, cpu) = &boot_pageset[cpu]; - } -} - -static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - int ret = NOTIFY_OK; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (process_zones(cpu)) - ret = NOTIFY_BAD; - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - free_zone_pagesets(cpu); - break; - default: - break; + if (percpu_pagelist_fraction) + setup_pagelist_highmark(pcp, + (zone->present_pages / + percpu_pagelist_fraction)); + } } - return ret; } -static struct notifier_block __cpuinitdata pageset_notifier = - { &pageset_cpuup_callback, NULL, 0 }; - -void __init setup_per_cpu_pageset(void) -{ - int err; - 
- /* Initialize per_cpu_pageset for cpu 0. - * A cpuup callback will do this for every cpu - * as it comes online - */ - err = process_zones(smp_processor_id()); - BUG_ON(err); - register_cpu_notifier(&pageset_notifier); -} - -#endif - static noinline __init_refok int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { @@ -3249,11 +3222,11 @@ static int __zone_pcp_update(void *data) int cpu; unsigned long batch = zone_batchsize(zone), flags; - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - pset = zone_pcp(zone, cpu); + pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; local_irq_save(flags); @@ -3271,21 +3244,17 @@ void zone_pcp_update(struct zone *zone) static __meminit void zone_pcp_init(struct zone *zone) { - int cpu; - unsigned long batch = zone_batchsize(zone); + /* + * per cpu subsystem is not up at this point. The following code + * relies on the ability of the linker to provide the + * offset of a (static) per cpu variable into the per cpu area. + */ + zone->pageset = &boot_pageset; - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. Slab allocator not functional yet */ - zone_pcp(zone, cpu) = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } if (zone->present_pages) - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", + zone->name, zone->present_pages, + zone_batchsize(zone)); } __meminit int init_currently_empty_zone(struct zone *zone, @@ -3424,6 +3393,61 @@ void __init free_bootmem_with_active_regions(int nid, } } +int __init add_from_early_node_map(struct range *range, int az, + int nr_range, int nid) +{ + int i; + u64 start, end; + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + start = early_node_map[i].start_pfn; + end = early_node_map[i].end_pfn; + nr_range = add_range(range, az, nr_range, start, end); + } + return nr_range; +} + +#ifdef CONFIG_NO_BOOTMEM +void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + int i; + void *ptr; + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + u64 addr; + u64 ei_start, ei_last; + + ei_last = early_node_map[i].end_pfn; + ei_last <<= PAGE_SHIFT; + ei_start = early_node_map[i].start_pfn; + ei_start <<= PAGE_SHIFT; + addr = find_early_area(ei_start, ei_last, + goal, limit, size, align); + + if (addr == -1ULL) + continue; + +#if 0 + printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", + nid, + ei_start, ei_last, goal, limit, size, + align, addr); +#endif + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + reserve_early_without_check(addr, addr + size, "BOOTMEM"); + return ptr; + } + + return NULL; +} +#endif + + void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -3573,7 +3597,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. 
*/ -static unsigned long __meminit __absent_pages_in_range(int nid, +unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -3988,7 +4012,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /* Merge backward if suitable */ - if (start_pfn < early_node_map[i].end_pfn && + if (start_pfn < early_node_map[i].start_pfn && end_pfn >= early_node_map[i].start_pfn) { early_node_map[i].start_pfn = start_pfn; return; @@ -4102,7 +4126,7 @@ static int __init cmp_node_active_region(const void *a, const void *b) } /* sort the node_map by start_pfn */ -static void __init sort_node_map(void) +void __init sort_node_map(void) { sort(early_node_map, (size_t)nr_nodemap_entries, sizeof(struct node_active_region), @@ -4366,8 +4390,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s %0#10lx -> %0#10lx\n", - zone_names[i], + printk(" %-8s ", zone_names[i]); + if (arch_zone_lowest_possible_pfn[i] == + arch_zone_highest_possible_pfn[i]) + printk("empty\n"); + else + printk("%0#10lx -> %0#10lx\n", arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); } @@ -4456,7 +4484,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; +struct pglist_data __refdata contig_page_data = { +#ifndef CONFIG_NO_BOOTMEM + .bdata = &bootmem_node_data[0] +#endif + }; EXPORT_SYMBOL(contig_page_data); #endif @@ -4799,10 +4831,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; - setup_pagelist_highmark(zone_pcp(zone, cpu), high); + setup_pagelist_highmark( + per_cpu_ptr(zone->pageset, cpu), high); } } return 0; @@ -5002,23 +5035,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, int set_migratetype_isolate(struct page *page) { struct zone *zone; - unsigned long flags; + struct page *curr_page; + unsigned long flags, pfn, iter; + unsigned long immobile = 0; + struct memory_isolate_notify arg; + int notifier_ret; int ret = -EBUSY; int zone_idx; zone = page_zone(page); zone_idx = zone_idx(zone); + spin_lock_irqsave(&zone->lock, flags); + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || + zone_idx == ZONE_MOVABLE) { + ret = 0; + goto out; + } + + pfn = page_to_pfn(page); + arg.start_pfn = pfn; + arg.nr_pages = pageblock_nr_pages; + arg.pages_found = 0; + /* - * In future, more migrate types will be able to be isolation target. + * It may be possible to isolate a pageblock even if the + * migratetype is not MIGRATE_MOVABLE. The memory isolation + * notifier chain is used by balloon drivers to return the + * number of pages in a range that are held by the balloon + * driver to shrink memory. If all the pages are accounted for + * by balloons, are free, or on the LRU, isolation can continue. + * Later, for example, when memory hotplug notifier runs, these + * pages reported as "can be isolated" should be isolated(freed) + * by the balloon driver through the memory notifier chain. 
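
A sketch of the balloon-driver side that the memory isolation notifier used below expects; the callback and page-counting helper are hypothetical, while struct memory_isolate_notify, MEM_ISOLATE_COUNT and the registration helper come from the same series in drivers/base/memory.c.

static unsigned long example_count_balloon_pages(unsigned long start_pfn,
						 unsigned long nr_pages);

static int example_isolate_notify(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct memory_isolate_notify *arg = data;

	/* report how many pages in the range are held by the balloon */
	if (action == MEM_ISOLATE_COUNT)
		arg->pages_found += example_count_balloon_pages(arg->start_pfn,
								arg->nr_pages);
	return NOTIFY_OK;
}

static struct notifier_block example_isolate_nb = {
	.notifier_call = example_isolate_notify,
};

/* registered once at driver init:
 *	register_memory_isolate_notifier(&example_isolate_nb);
 */
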
*/ - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && - zone_idx != ZONE_MOVABLE) + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); + notifier_ret = notifier_to_errno(notifier_ret); + if (notifier_ret || !arg.pages_found) goto out; - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - move_freepages_block(zone, page, MIGRATE_ISOLATE); - ret = 0; + + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { + if (!pfn_valid_within(pfn)) + continue; + + curr_page = pfn_to_page(iter); + if (!page_count(curr_page) || PageLRU(curr_page)) + continue; + + immobile++; + } + + if (arg.pages_found == immobile) + ret = 0; + out: + if (!ret) { + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + move_freepages_block(zone, page, MIGRATE_ISOLATE); + } + spin_unlock_irqrestore(&zone->lock, flags); if (!ret) drain_all_pages(); @@ -5085,3 +5160,101 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) spin_unlock_irqrestore(&zone->lock, flags); } #endif + +#ifdef CONFIG_MEMORY_FAILURE +bool is_free_buddy_page(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + int order; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct page *page_head = page - (pfn & ((1 << order) - 1)); + + if (PageBuddy(page_head) && page_order(page_head) >= order) + break; + } + spin_unlock_irqrestore(&zone->lock, flags); + + return order < MAX_ORDER; +} +#endif + +static struct trace_print_flags pageflag_names[] = { + {1UL << PG_locked, "locked" }, + {1UL << PG_error, "error" }, + {1UL << PG_referenced, "referenced" }, + {1UL << PG_uptodate, "uptodate" }, + {1UL << PG_dirty, "dirty" }, + {1UL << PG_lru, "lru" }, + {1UL << PG_active, "active" }, + {1UL << PG_slab, "slab" }, + {1UL << PG_owner_priv_1, "owner_priv_1" }, + {1UL << PG_arch_1, "arch_1" }, + {1UL << PG_reserved, "reserved" }, + {1UL << PG_private, "private" }, + {1UL << PG_private_2, "private_2" }, + {1UL << PG_writeback, "writeback" }, +#ifdef CONFIG_PAGEFLAGS_EXTENDED + {1UL << PG_head, "head" }, + {1UL << PG_tail, "tail" }, +#else + {1UL << PG_compound, "compound" }, +#endif + {1UL << PG_swapcache, "swapcache" }, + {1UL << PG_mappedtodisk, "mappedtodisk" }, + {1UL << PG_reclaim, "reclaim" }, + {1UL << PG_buddy, "buddy" }, + {1UL << PG_swapbacked, "swapbacked" }, + {1UL << PG_unevictable, "unevictable" }, +#ifdef CONFIG_MMU + {1UL << PG_mlocked, "mlocked" }, +#endif +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + {1UL << PG_uncached, "uncached" }, +#endif +#ifdef CONFIG_MEMORY_FAILURE + {1UL << PG_hwpoison, "hwpoison" }, +#endif + {-1UL, NULL }, +}; + +static void dump_page_flags(unsigned long flags) +{ + const char *delim = ""; + unsigned long mask; + int i; + + printk(KERN_ALERT "page flags: %#lx(", flags); + + /* remove zone id */ + flags &= (1UL << NR_PAGEFLAGS) - 1; + + for (i = 0; pageflag_names[i].name && flags; i++) { + + mask = pageflag_names[i].mask; + if ((flags & mask) != mask) + continue; + + flags &= ~mask; + printk("%s%s", delim, pageflag_names[i].name); + delim = "|"; + } + + /* check for left over flags */ + if (flags) + printk("%s%#lx", delim, flags); + + printk(")\n"); +} + +void dump_page(struct page *page) +{ + printk(KERN_ALERT + "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + page, page_count(page), page_mapcount(page), + page->mapping, page->index); + dump_page_flags(page->flags); +} diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3d535d594826..6c0081441a32 100644 --- 
a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -284,6 +284,7 @@ static DEFINE_MUTEX(swap_cgroup_mutex); struct swap_cgroup_ctrl { struct page **map; unsigned long length; + spinlock_t lock; }; struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; @@ -335,6 +336,43 @@ not_enough_page: } /** + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. + * @end: swap entry to be cmpxchged + * @old: old id + * @new: new id + * + * Returns old id at success, 0 at failure. + * (There is no mem_cgroup useing 0 as its id) + */ +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new) +{ + int type = swp_type(ent); + unsigned long offset = swp_offset(ent); + unsigned long idx = offset / SC_PER_PAGE; + unsigned long pos = offset & SC_POS_MASK; + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + unsigned long flags; + unsigned short retval; + + ctrl = &swap_cgroup_ctrl[type]; + + mappage = ctrl->map[idx]; + sc = page_address(mappage); + sc += pos; + spin_lock_irqsave(&ctrl->lock, flags); + retval = sc->id; + if (retval == old) + sc->id = new; + else + retval = 0; + spin_unlock_irqrestore(&ctrl->lock, flags); + return retval; +} + +/** * swap_cgroup_record - record mem_cgroup for this swp_entry. * @ent: swap entry to be recorded into * @mem: mem_cgroup to be recorded @@ -352,14 +390,17 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) struct page *mappage; struct swap_cgroup *sc; unsigned short old; + unsigned long flags; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; + spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; sc->id = id; + spin_unlock_irqrestore(&ctrl->lock, flags); return old; } @@ -411,6 +452,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) mutex_lock(&swap_cgroup_mutex); ctrl->length = length; ctrl->map = array; + spin_lock_init(&ctrl->lock); if (swap_cgroup_prepare(type)) { /* memory shortage */ ctrl->map = NULL; diff --git a/mm/page_io.c b/mm/page_io.c index c6f3e5071de3..31a3b962230a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/kernel_stat.h> +#include <linux/gfp.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/bio.h> @@ -19,20 +20,15 @@ #include <linux/writeback.h> #include <asm/pgtable.h> -static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, +static struct bio *get_swap_bio(gfp_t gfp_flags, struct page *page, bio_end_io_t end_io) { struct bio *bio; bio = bio_alloc(gfp_flags, 1); if (bio) { - struct swap_info_struct *sis; - swp_entry_t entry = { .val = index, }; - - sis = get_swap_info_struct(swp_type(entry)); - bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * - (PAGE_SIZE >> 9); - bio->bi_bdev = sis->bdev; + bio->bi_sector = map_swap_page(page, &bio->bi_bdev); + bio->bi_sector <<= PAGE_SHIFT - 9; bio->bi_io_vec[0].bv_page = page; bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; @@ -102,8 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } - bio = get_swap_bio(GFP_NOIO, page_private(page), page, - end_swap_bio_write); + bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); unlock_page(page); @@ -127,8 +122,7 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); - bio = get_swap_bio(GFP_KERNEL, page_private(page), page, - end_swap_bio_read); + bio = 
get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); ret = -ENOMEM; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d5878bed7841..8b1a2ce21ee5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -1,6 +1,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/hugetlb.h> static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -79,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, return err; } +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, + unsigned long end) +{ + unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); + return boundary < end ? boundary : end; +} + +static int walk_hugetlb_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(vma); + unsigned long next; + unsigned long hmask = huge_page_mask(h); + pte_t *pte; + int err = 0; + + do { + next = hugetlb_entry_end(h, addr, end); + pte = huge_pte_offset(walk->mm, addr & hmask); + if (pte && walk->hugetlb_entry) + err = walk->hugetlb_entry(pte, hmask, addr, next, walk); + if (err) + return err; + } while (addr = next, addr != end); + + return 0; +} +#endif + /** * walk_page_range - walk a memory map's page tables with a callback * @mm: memory map to walk @@ -107,6 +139,7 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd_t *pgd; unsigned long next; int err = 0; + struct vm_area_struct *vma; if (addr >= end) return err; @@ -117,11 +150,34 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); + + /* + * handle hugetlb vma individually because pagetable walk for + * the hugetlb page is dependent on the architecture and + * we can't handled it in the same manner as non-huge pages. + */ + vma = find_vma(walk->mm, addr); +#ifdef CONFIG_HUGETLB_PAGE + if (vma && is_vm_hugetlb_page(vma)) { + if (vma->vm_end < next) + next = vma->vm_end; + /* + * Hugepage is very tightly coupled with vma, so + * walk through hugetlb entries within a given vma. + */ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + break; + pgd = pgd_offset(walk->mm, next); + continue; + } +#endif if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; + pgd++; continue; } if (walk->pgd_entry) @@ -131,7 +187,8 @@ int walk_page_range(unsigned long addr, unsigned long end, err = walk_pud_range(pgd, addr, next, walk); if (err) break; - } while (pgd++, addr = next, addr != end); + pgd++; + } while (addr = next, addr != end); return err; } diff --git a/mm/percpu.c b/mm/percpu.c index 5adfc268b408..6e09741ddc62 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -46,8 +46,6 @@ * * To use this allocator, arch code should do the followings. 
* - * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA - * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default @@ -74,6 +72,7 @@ #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/tlbflush.h> +#include <asm/io.h> #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ @@ -81,13 +80,15 @@ /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ - (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ - + (unsigned long)__per_cpu_start) + (void __percpu *)((unsigned long)(addr) - \ + (unsigned long)pcpu_base_addr + \ + (unsigned long)__per_cpu_start) #endif #ifndef __pcpu_ptr_to_addr #define __pcpu_ptr_to_addr(ptr) \ - (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ - - (unsigned long)__per_cpu_start) + (void __force *)((unsigned long)(ptr) + \ + (unsigned long)pcpu_base_addr - \ + (unsigned long)__per_cpu_start) #endif struct pcpu_chunk { @@ -914,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) int rs, re; /* quick path, check whether it's empty already */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - if (rs == page_start && re == page_end) - return; - break; - } + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, page_end); + if (rs == page_start && re == page_end) + return; /* immutable chunks can't be depopulated */ WARN_ON(chunk->immutable); @@ -969,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) int rs, re, rc; /* quick path, check whether all pages are already there */ - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { - if (rs == page_start && re == page_end) - goto clear; - break; - } + rs = page_start; + pcpu_next_pop(chunk, &rs, &re, page_end); + if (rs == page_start && re == page_end) + goto clear; /* need to allocate and map pages, this chunk can't be immutable */ WARN_ON(chunk->immutable); @@ -1068,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) { static int warn_limit = 10; struct pcpu_chunk *chunk; @@ -1197,7 +1196,7 @@ fail_unlock_mutex: * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -void *__alloc_percpu(size_t size, size_t align) +void __percpu *__alloc_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, false); } @@ -1218,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -void *__alloc_reserved_percpu(size_t size, size_t align) +void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, true); } @@ -1270,9 +1269,9 @@ static void pcpu_reclaim(struct work_struct *work) * CONTEXT: * Can be called from atomic context. 
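
A minimal usage sketch for the dynamic percpu API whose signatures gain __percpu annotations in this file (the counter structure is hypothetical): pointers returned by alloc_percpu() must be dereferenced through per_cpu_ptr() or the this_cpu accessors rather than directly.

struct example_counter {
	unsigned long events;
};

static struct example_counter __percpu *example_counters;

static int example_init(void)
{
	example_counters = alloc_percpu(struct example_counter);
	if (!example_counters)
		return -ENOMEM;
	return 0;
}

static unsigned long example_sum(void)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(example_counters, cpu)->events;
	return sum;
}

static void example_exit(void)
{
	free_percpu(example_counters);
}
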
*/ -void free_percpu(void *ptr) +void free_percpu(void __percpu *ptr) { - void *addr = __pcpu_ptr_to_addr(ptr); + void *addr; struct pcpu_chunk *chunk; unsigned long flags; int off; @@ -1280,6 +1279,8 @@ void free_percpu(void *ptr) if (!ptr) return; + addr = __pcpu_ptr_to_addr(ptr); + spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); @@ -1302,6 +1303,53 @@ void free_percpu(void *ptr) } EXPORT_SYMBOL_GPL(free_percpu); +/** + * is_kernel_percpu_address - test whether address is from static percpu area + * @addr: address to test + * + * Test whether @addr belongs to in-kernel static percpu area. Module + * static percpu areas are not considered. For those, use + * is_module_percpu_address(). + * + * RETURNS: + * %true if @addr is from in-kernel static percpu area, %false otherwise. + */ +bool is_kernel_percpu_address(unsigned long addr) +{ + const size_t static_size = __per_cpu_end - __per_cpu_start; + void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); + unsigned int cpu; + + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(base, cpu); + + if ((void *)addr >= start && (void *)addr < start + static_size) + return true; + } + return false; +} + +/** + * per_cpu_ptr_to_phys - convert translated percpu address to physical address + * @addr: the address to be converted to physical address + * + * Given @addr which is dereferenceable address obtained via one of + * percpu access macros, this function translates it into its physical + * address. The caller is responsible for ensuring @addr stays valid + * until this function finishes. + * + * RETURNS: + * The physical address for @addr. + */ +phys_addr_t per_cpu_ptr_to_phys(void *addr) +{ + if ((unsigned long)addr < VMALLOC_START || + (unsigned long)addr >= VMALLOC_END) + return __pa(addr); + else + return page_to_phys(vmalloc_to_page(addr)); +} + static inline size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, ssize_t *dyn_sizep) diff --git a/mm/percpu_up.c b/mm/percpu_up.c new file mode 100644 index 000000000000..c4351c7f57d2 --- /dev/null +++ b/mm/percpu_up.c @@ -0,0 +1,30 @@ +/* + * mm/percpu_up.c - dummy percpu memory allocator implementation for UP + */ + +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/slab.h> + +void __percpu *__alloc_percpu(size_t size, size_t align) +{ + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. 
+ */ + WARN_ON_ONCE(align > SMP_CACHE_BYTES); + return kzalloc(size, GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(__alloc_percpu); + +void free_percpu(void __percpu *p) +{ + kfree(p); +} +EXPORT_SYMBOL_GPL(free_percpu); + +phys_addr_t per_cpu_ptr_to_phys(void *addr) +{ + return __pa(addr); +} diff --git a/mm/quicklist.c b/mm/quicklist.c index 6633965bb27b..2876349339a7 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -14,6 +14,7 @@ */ #include <linux/kernel.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/module.h> diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa2345235..dfa9a1a03a11 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/blkdev.h> @@ -501,6 +502,12 @@ void page_cache_sync_readahead(struct address_space *mapping, if (!ra->ra_pages) return; + /* be dumb */ + if (filp && (filp->f_mode & FMODE_RANDOM)) { + force_page_cache_readahead(mapping, filp, offset, req_size); + return; + } + /* do read-ahead */ ondemand_readahead(mapping, ra, filp, false, offset, req_size); } @@ -547,5 +554,17 @@ page_cache_async_readahead(struct address_space *mapping, /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); + +#ifdef CONFIG_BLOCK + /* + * Normally the current page is !uptodate and lock_page() will be + * immediately called to implicitly unplug the device. However this + * is not always true for RAID conifgurations, where data arrives + * not strictly in their submission order. In this case we need to + * explicitly kick off the IO. + */ + if (PageUptodate(page)) + blk_run_backing_dev(mapping->backing_dev_info, NULL); +#endif } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index dd43373a483f..eaa7a09eb72e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -49,6 +49,7 @@ #include <linux/swapops.h> #include <linux/slab.h> #include <linux/init.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/module.h> @@ -61,17 +62,28 @@ #include "internal.h" static struct kmem_cache *anon_vma_cachep; +static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); } -static inline void anon_vma_free(struct anon_vma *anon_vma) +void anon_vma_free(struct anon_vma *anon_vma) { kmem_cache_free(anon_vma_cachep, anon_vma); } +static inline struct anon_vma_chain *anon_vma_chain_alloc(void) +{ + return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); +} + +void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) +{ + kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); +} + /** * anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question @@ -102,18 +114,23 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) int anon_vma_prepare(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; might_sleep(); if (unlikely(!anon_vma)) { struct mm_struct *mm = vma->vm_mm; struct anon_vma *allocated; + avc = anon_vma_chain_alloc(); + if (!avc) + goto out_enomem; + anon_vma = find_mergeable_anon_vma(vma); allocated = NULL; if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) - return -ENOMEM; + goto out_enomem_free_avc; allocated = anon_vma; } spin_lock(&anon_vma->lock); @@ -122,67 +139,141 @@ int anon_vma_prepare(struct 
vm_area_struct *vma) spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - list_add_tail(&vma->anon_vma_node, &anon_vma->head); + avc->anon_vma = anon_vma; + avc->vma = vma; + list_add(&avc->same_vma, &vma->anon_vma_chain); + list_add(&avc->same_anon_vma, &anon_vma->head); allocated = NULL; } spin_unlock(&mm->page_table_lock); spin_unlock(&anon_vma->lock); - if (unlikely(allocated)) + if (unlikely(allocated)) { anon_vma_free(allocated); + anon_vma_chain_free(avc); + } } return 0; + + out_enomem_free_avc: + anon_vma_chain_free(avc); + out_enomem: + return -ENOMEM; } -void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) +static void anon_vma_chain_link(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) { - BUG_ON(vma->anon_vma != next->anon_vma); - list_del(&next->anon_vma_node); + avc->vma = vma; + avc->anon_vma = anon_vma; + list_add(&avc->same_vma, &vma->anon_vma_chain); + + spin_lock(&anon_vma->lock); + list_add_tail(&avc->same_anon_vma, &anon_vma->head); + spin_unlock(&anon_vma->lock); } -void __anon_vma_link(struct vm_area_struct *vma) +/* + * Attach the anon_vmas from src to dst. + * Returns 0 on success, -ENOMEM on failure. + */ +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc, *pavc; + + list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { + avc = anon_vma_chain_alloc(); + if (!avc) + goto enomem_failure; + anon_vma_chain_link(dst, avc, pavc->anon_vma); + } + return 0; - if (anon_vma) - list_add_tail(&vma->anon_vma_node, &anon_vma->head); + enomem_failure: + unlink_anon_vmas(dst); + return -ENOMEM; } -void anon_vma_link(struct vm_area_struct *vma) +/* + * Attach vma to its own anon_vma, as well as to the anon_vmas that + * the corresponding VMA in the parent process is attached to. + * Returns 0 on success, non-zero on failure. + */ +int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; + struct anon_vma *anon_vma; - if (anon_vma) { - spin_lock(&anon_vma->lock); - list_add_tail(&vma->anon_vma_node, &anon_vma->head); - spin_unlock(&anon_vma->lock); - } + /* Don't bother if the parent process has no anon_vma here. */ + if (!pvma->anon_vma) + return 0; + + /* + * First, attach the new VMA to the parent VMA's anon_vmas, + * so rmap can find non-COWed pages in child processes. + */ + if (anon_vma_clone(vma, pvma)) + return -ENOMEM; + + /* Then add our own anon_vma. */ + anon_vma = anon_vma_alloc(); + if (!anon_vma) + goto out_error; + avc = anon_vma_chain_alloc(); + if (!avc) + goto out_error_free_anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); + /* Mark this anon_vma as the one where our new (COWed) pages go. */ + vma->anon_vma = anon_vma; + + return 0; + + out_error_free_anon_vma: + anon_vma_free(anon_vma); + out_error: + unlink_anon_vmas(vma); + return -ENOMEM; } -void anon_vma_unlink(struct vm_area_struct *vma) +static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma *anon_vma = anon_vma_chain->anon_vma; int empty; + /* If anon_vma_fork fails, we can get an empty anon_vma_chain. 
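The anon_vma_prepare()/anon_vma_clone()/anon_vma_fork() code above reaches the new link object only through avc->vma, avc->anon_vma, avc->same_vma and avc->same_anon_vma; the layout those accesses imply is roughly the following (the authoritative definition lives in include/linux/rmap.h and may differ in detail):

/*
 * One anon_vma_chain sits on two lists at once: same_vma threads it onto
 * vma->anon_vma_chain, same_anon_vma threads it onto anon_vma->head.
 */
struct anon_vma_chain {
    struct vm_area_struct *vma;
    struct anon_vma *anon_vma;
    struct list_head same_vma;       /* chained via vma->anon_vma_chain */
    struct list_head same_anon_vma;  /* chained via anon_vma->head */
};

With that in place a VMA can visit every anon_vma it is attached to with list_for_each_entry(avc, &vma->anon_vma_chain, same_vma), and the rmap walkers below visit every VMA attached to an anon_vma with list_for_each_entry(avc, &anon_vma->head, same_anon_vma).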
*/ if (!anon_vma) return; spin_lock(&anon_vma->lock); - list_del(&vma->anon_vma_node); + list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head); + empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); spin_unlock(&anon_vma->lock); if (empty) anon_vma_free(anon_vma); } +void unlink_anon_vmas(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc, *next; + + /* Unlink each anon_vma chained to the VMA. */ + list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { + anon_vma_unlink(avc); + list_del(&avc->same_vma); + anon_vma_chain_free(avc); + } +} + static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); + ksm_refcount_init(anon_vma); INIT_LIST_HEAD(&anon_vma->head); } @@ -190,6 +281,7 @@ void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); + anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); } /* @@ -202,8 +294,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page) unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long) page->mapping; - if (!(anon_mapping & PAGE_MAPPING_ANON)) + anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); + if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!page_mapped(page)) goto out; @@ -248,8 +340,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { if (PageAnon(page)) { - if ((void *)vma->anon_vma != - (void *)page->mapping - PAGE_MAPPING_ANON) + if (vma->anon_vma != page_anon_vma(page)) return -EFAULT; } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { if (!vma->vm_file || @@ -337,21 +428,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. 
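The reworked walkers below compute the candidate user address once with vma_address() and skip any VMA where it returns -EFAULT; for linearly mapped pages that helper is essentially the following arithmetic (a sketch of the existing helper, not something added by this patch):

static unsigned long vma_address_sketch(struct page *page,
                                        struct vm_area_struct *vma)
{
    pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
    unsigned long address;

    address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
    if (address < vma->vm_start || address >= vma->vm_end)
        return -EFAULT;  /* page is not mapped within this vma */
    return address;
}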
*/ -static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, - unsigned int *mapcount, - unsigned long *vm_flags) +int page_referenced_one(struct page *page, struct vm_area_struct *vma, + unsigned long address, unsigned int *mapcount, + unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - unsigned long address; pte_t *pte; spinlock_t *ptl; int referenced = 0; - address = vma_address(page, vma); - if (address == -EFAULT) - goto out; - pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -388,9 +473,10 @@ static int page_referenced_one(struct page *page, out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); -out: + if (referenced) *vm_flags |= vma->vm_flags; +out: return referenced; } @@ -400,7 +486,7 @@ static int page_referenced_anon(struct page *page, { unsigned int mapcount; struct anon_vma *anon_vma; - struct vm_area_struct *vma; + struct anon_vma_chain *avc; int referenced = 0; anon_vma = page_lock_anon_vma(page); @@ -408,7 +494,11 @@ static int page_referenced_anon(struct page *page, return referenced; mapcount = page_mapcount(page); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -416,7 +506,7 @@ static int page_referenced_anon(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, + referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); if (!mapcount) break; @@ -474,6 +564,9 @@ static int page_referenced_file(struct page *page, mapcount = page_mapcount(page); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -481,7 +574,7 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, + referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); if (!mapcount) break; @@ -507,46 +600,44 @@ int page_referenced(struct page *page, unsigned long *vm_flags) { int referenced = 0; - - if (TestClearPageReferenced(page)) - referenced++; + int we_locked = 0; *vm_flags = 0; - if (page_mapped(page) && page->mapping) { - if (PageAnon(page)) + if (page_mapped(page) && page_rmapping(page)) { + if (!is_locked && (!PageAnon(page) || PageKsm(page))) { + we_locked = trylock_page(page); + if (!we_locked) { + referenced++; + goto out; + } + } + if (unlikely(PageKsm(page))) + referenced += page_referenced_ksm(page, mem_cont, + vm_flags); + else if (PageAnon(page)) referenced += page_referenced_anon(page, mem_cont, vm_flags); - else if (is_locked) + else if (page->mapping) referenced += page_referenced_file(page, mem_cont, vm_flags); - else if (!trylock_page(page)) - referenced++; - else { - if (page->mapping) - referenced += page_referenced_file(page, - mem_cont, vm_flags); + if (we_locked) unlock_page(page); - } } - +out: if (page_test_and_clear_young(page)) referenced++; return referenced; } -static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) +static int page_mkclean_one(struct page *page, 
struct vm_area_struct *vma, + unsigned long address) { struct mm_struct *mm = vma->vm_mm; - unsigned long address; pte_t *pte; spinlock_t *ptl; int ret = 0; - address = vma_address(page, vma); - if (address == -EFAULT) - goto out; - pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) goto out; @@ -578,8 +669,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - if (vma->vm_flags & VM_SHARED) - ret += page_mkclean_one(page, vma); + if (vma->vm_flags & VM_SHARED) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret += page_mkclean_one(page, vma, address); + } } spin_unlock(&mapping->i_mmap_lock); return ret; @@ -607,6 +702,30 @@ int page_mkclean(struct page *page) EXPORT_SYMBOL_GPL(page_mkclean); /** + * page_move_anon_rmap - move a page to our anon_vma + * @page: the page to move to our anon_vma + * @vma: the vma the page belongs to + * @address: the user virtual address mapped + * + * When a page belongs exclusively to one process after a COW event, + * that page can be moved into the anon_vma that belongs to just that + * process, so the rmap code will not search the parent or sibling + * processes. + */ +void page_move_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(!anon_vma); + VM_BUG_ON(page->index != linear_page_index(vma, address)); + + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; +} + +/** * __page_set_anon_rmap - setup new anonymous rmap * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added @@ -620,14 +739,7 @@ static void __page_set_anon_rmap(struct page *page, BUG_ON(!anon_vma); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; - page->index = linear_page_index(vma, address); - - /* - * nr_mapped state can be updated without turning off - * interrupts because it is not modified via interrupt. - */ - __inc_zone_page_state(page, NR_ANON_PAGES); } /** @@ -652,9 +764,6 @@ static void __page_check_anon_rmap(struct page *page, * are initially only visible via the pagetables, and the pte is locked * over the call to page_add_new_anon_rmap. */ - struct anon_vma *anon_vma = vma->anon_vma; - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - BUG_ON(page->mapping != (struct address_space *)anon_vma); BUG_ON(page->index != linear_page_index(vma, address)); #endif } @@ -665,14 +774,23 @@ static void __page_check_anon_rmap(struct page *page, * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * - * The caller needs to hold the pte lock and the page must be locked. + * The caller needs to hold the pte lock, and the page must be locked in + * the anon_vma case: to serialize mapping,index checking after setting, + * and to ensure that PageAnon is not being upgraded racily to PageKsm + * (but PageKsm is never downgraded to PageAnon). 
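page_move_anon_rmap() and __page_set_anon_rmap() above reuse page->mapping as a tagged pointer: the anon_vma address with the low PAGE_MAPPING_ANON bit set. A self-contained user-space model of that trick (all names here are invented):

#include <stdint.h>
#include <stdio.h>

#define MAPPING_ANON 0x1UL   /* stands in for PAGE_MAPPING_ANON */

struct fake_anon_vma { long pad; };

int main(void)
{
    struct fake_anon_vma av;
    /* Tagging works because the object is aligned, so a valid pointer
     * always has its low bit clear. */
    uintptr_t mapping = (uintptr_t)&av | MAPPING_ANON;

    if (mapping & MAPPING_ANON) {
        struct fake_anon_vma *anon_vma =
            (struct fake_anon_vma *)(mapping & ~MAPPING_ANON);
        printf("anonymous; pointer recovered intact: %d\n", anon_vma == &av);
    }
    return 0;
}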
*/ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + int first = atomic_inc_and_test(&page->_mapcount); + if (first) + __inc_zone_page_state(page, NR_ANON_PAGES); + if (unlikely(PageKsm(page))) + return; + VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); - if (atomic_inc_and_test(&page->_mapcount)) + if (first) __page_set_anon_rmap(page, vma, address); else __page_check_anon_rmap(page, vma, address); @@ -694,6 +812,7 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ + __inc_zone_page_state(page, NR_ANON_PAGES); __page_set_anon_rmap(page, vma, address); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); @@ -711,7 +830,7 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_mapped_file_stat(page, 1); + mem_cgroup_update_file_mapped(page, 1); } } @@ -743,8 +862,8 @@ void page_remove_rmap(struct page *page) __dec_zone_page_state(page, NR_ANON_PAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); + mem_cgroup_update_file_mapped(page, -1); } - mem_cgroup_update_mapped_file_stat(page, -1); /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -760,20 +879,15 @@ void page_remove_rmap(struct page *page) * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - enum ttu_flags flags) +int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + unsigned long address, enum ttu_flags flags) { struct mm_struct *mm = vma->vm_mm; - unsigned long address; pte_t *pte; pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; - address = vma_address(page, vma); - if (address == -EFAULT) - goto out; - pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -784,10 +898,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * skipped over this mm) then we should reactivate it. */ if (!(flags & TTU_IGNORE_MLOCK)) { - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_MLOCK; + if (vma->vm_flags & VM_LOCKED) + goto out_mlock; + + if (TTU_ACTION(flags) == TTU_MUNLOCK) goto out_unmap; - } } if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pte)) { @@ -809,9 +924,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { if (PageAnon(page)) - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); else - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { @@ -822,14 +937,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... 
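page_add_anon_rmap() and page_add_new_anon_rmap() above lean on the convention that page->_mapcount starts at -1, so atomic_inc_and_test() returning true singles out the very first mapping, which is where NR_ANON_PAGES is bumped. A tiny stand-alone model of that convention:

#include <stdio.h>

/* inc_and_test() mimics atomic_inc_and_test(): increment, report "became 0". */
static int inc_and_test(int *v)
{
    return ++*v == 0;
}

int main(void)
{
    int mapcount = -1;   /* "no mappings yet", as in struct page */
    int i;

    for (i = 1; i <= 3; i++) {
        if (inc_and_test(&mapcount))
            printf("mapping %d is the first -> account the page\n", i);
        else
            printf("mapping %d is additional (mapcount now %d)\n", i, mapcount);
    }
    return 0;
}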
*/ - swap_duplicate(entry); + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration @@ -847,8 +967,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, entry = make_migration_entry(page, pte_write(pteval)); set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else - dec_mm_counter(mm, file_rss); - + dec_mm_counter(mm, MM_FILEPAGES); page_remove_rmap(page); page_cache_release(page); @@ -857,6 +976,27 @@ out_unmap: pte_unmap_unlock(pte, ptl); out: return ret; + +out_mlock: + pte_unmap_unlock(pte, ptl); + + + /* + * We need mmap_sem locking here: otherwise the VM_LOCKED check is + * racy and can give an unstable result. We can't wait for the lock + * because we now hold anon_vma->lock or mapping->i_mmap_lock. + * If the trylock fails, the page remains on the evictable lru and + * vmscan can later retry moving it to the unevictable lru if the + * page is actually mlocked. + */ + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (vma->vm_flags & VM_LOCKED) { + mlock_vma_page(page); + ret = SWAP_MLOCK; + } + up_read(&vma->vm_mm->mmap_sem); + } + return ret; } /* @@ -922,11 +1062,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; /* - * MLOCK_PAGES => feature is configured. - * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, + * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, * keep the sem while scanning the cluster for mlocking pages. */ - if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { locked_vma = (vma->vm_flags & VM_LOCKED); if (!locked_vma) up_read(&vma->vm_mm->mmap_sem); /* don't need it */ @@ -967,7 +1106,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); @@ -976,29 +1115,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -/* - * common handling for pages mapped in VM_LOCKED vmas - */ -static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) -{ - int mlocked = 0; - - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_page(page); - mlocked++; /* really mlocked the page */ - } - up_read(&vma->vm_mm->mmap_sem); - } - return mlocked; -} - /** * try_to_unmap_anon - unmap or unlock anonymous page using the object-based * rmap method * @page: the page to unmap/unlock - * @unlock: request for unlock rather than unmap [unlikely] - * @migration: unmapping for migration - ignored if @unlock + * @flags: action and flags * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the anon_vma struct it points to. 
@@ -1013,43 +1134,24 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; - struct vm_area_struct *vma; - unsigned int mlocked = 0; + struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; - - if (MLOCK_PAGES && unlikely(unlock)) - ret = SWAP_SUCCESS; /* default for try_to_munlock() */ anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - if (MLOCK_PAGES && unlikely(unlock)) { - if (!((vma->vm_flags & VM_LOCKED) && - page_mapped_in_vma(page, vma))) - continue; /* must visit all unlocked vmas */ - ret = SWAP_MLOCK; /* saw at least one mlocked vma */ - } else { - ret = try_to_unmap_one(page, vma, flags); - if (ret == SWAP_FAIL || !page_mapped(page)) - break; - } - if (ret == SWAP_MLOCK) { - mlocked = try_to_mlock_page(page, vma); - if (mlocked) - break; /* stop if actually mlocked page */ - } + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = try_to_unmap_one(page, vma, address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) + break; } page_unlock_anon_vma(anon_vma); - - if (mlocked) - ret = SWAP_MLOCK; /* actually mlocked the page */ - else if (ret == SWAP_MLOCK) - ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ - return ret; } @@ -1079,48 +1181,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned long max_nl_cursor = 0; unsigned long max_nl_size = 0; unsigned int mapcount; - unsigned int mlocked = 0; - int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; - - if (MLOCK_PAGES && unlikely(unlock)) - ret = SWAP_SUCCESS; /* default for try_to_munlock() */ spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - if (MLOCK_PAGES && unlikely(unlock)) { - if (!((vma->vm_flags & VM_LOCKED) && - page_mapped_in_vma(page, vma))) - continue; /* must visit all vmas */ - ret = SWAP_MLOCK; - } else { - ret = try_to_unmap_one(page, vma, flags); - if (ret == SWAP_FAIL || !page_mapped(page)) - goto out; - } - if (ret == SWAP_MLOCK) { - mlocked = try_to_mlock_page(page, vma); - if (mlocked) - break; /* stop if actually mlocked page */ - } + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = try_to_unmap_one(page, vma, address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) + goto out; } - if (mlocked) + if (list_empty(&mapping->i_mmap_nonlinear)) goto out; - if (list_empty(&mapping->i_mmap_nonlinear)) + /* + * We don't bother to try to find the munlocked page in nonlinears. + * It's costly. Instead, later, page reclaim logic may call + * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. 
+ */ + if (TTU_ACTION(flags) == TTU_MUNLOCK) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (MLOCK_PAGES && unlikely(unlock)) { - if (!(vma->vm_flags & VM_LOCKED)) - continue; /* must visit all vmas */ - ret = SWAP_MLOCK; /* leave mlocked == 0 */ - goto out; /* no need to look further */ - } - if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && - (vma->vm_flags & VM_LOCKED)) - continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1153,16 +1237,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && - (vma->vm_flags & VM_LOCKED)) - continue; cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { - ret = try_to_unmap_cluster(cursor, &mapcount, - vma, page); - if (ret == SWAP_MLOCK) - mlocked = 2; /* to return below */ + if (try_to_unmap_cluster(cursor, &mapcount, + vma, page) == SWAP_MLOCK) + ret = SWAP_MLOCK; cursor += CLUSTER_SIZE; vma->vm_private_data = (void *) cursor; if ((int)mapcount <= 0) @@ -1183,10 +1263,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) vma->vm_private_data = NULL; out: spin_unlock(&mapping->i_mmap_lock); - if (mlocked) - ret = SWAP_MLOCK; /* actually mlocked the page */ - else if (ret == SWAP_MLOCK) - ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ return ret; } @@ -1210,7 +1286,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) BUG_ON(!PageLocked(page)); - if (PageAnon(page)) + if (unlikely(PageKsm(page))) + ret = try_to_unmap_ksm(page, flags); + else if (PageAnon(page)) ret = try_to_unmap_anon(page, flags); else ret = try_to_unmap_file(page, flags); @@ -1229,17 +1307,99 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) * * Return values are: * - * SWAP_SUCCESS - no vma's holding page mlocked. + * SWAP_AGAIN - no vma is holding page mlocked, or, * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem + * SWAP_FAIL - page cannot be located at present * SWAP_MLOCK - page is now mlocked. */ int try_to_munlock(struct page *page) { VM_BUG_ON(!PageLocked(page) || PageLRU(page)); - if (PageAnon(page)) + if (unlikely(PageKsm(page))) + return try_to_unmap_ksm(page, TTU_MUNLOCK); + else if (PageAnon(page)) return try_to_unmap_anon(page, TTU_MUNLOCK); else return try_to_unmap_file(page, TTU_MUNLOCK); } +#ifdef CONFIG_MIGRATION +/* + * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): + * Called by migrate.c to remove migration ptes, but might be used more later. + */ +static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct anon_vma *anon_vma; + struct anon_vma_chain *avc; + int ret = SWAP_AGAIN; + + /* + * Note: remove_migration_ptes() cannot use page_lock_anon_vma() + * because that depends on page_mapped(); but not all its usages + * are holding mmap_sem, which also gave the necessary guarantee + * (that this anon_vma's slab has not already been destroyed). + * This needs to be reviewed later: avoiding page_lock_anon_vma() + * is risky, and currently limits the usefulness of rmap_walk(). 
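rmap_walk(), completed just below, hands every (vma, address) pair at which the page might be mapped to a caller-supplied callback with the signature used in these hunks; migrate.c passes its migration-pte remover, but any callback of that shape works. A hypothetical example that merely counts the mappings it is shown:

/* Hypothetical rmap_walk() callback: returning SWAP_AGAIN keeps the walk going. */
static int count_one(struct page *page, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
    int *count = arg;

    (*count)++;
    return SWAP_AGAIN;
}

/* Caller must hold the page lock, as the VM_BUG_ON in rmap_walk() insists. */
static int count_mappings(struct page *page)
{
    int count = 0;

    rmap_walk(page, count_one, &count);
    return count;
}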
+ */ + anon_vma = page_anon_vma(page); + if (!anon_vma) + return ret; + spin_lock(&anon_vma->lock); + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = rmap_one(page, vma, address, arg); + if (ret != SWAP_AGAIN) + break; + } + spin_unlock(&anon_vma->lock); + return ret; +} + +static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct address_space *mapping = page->mapping; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int ret = SWAP_AGAIN; + + if (!mapping) + return ret; + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = rmap_one(page, vma, address, arg); + if (ret != SWAP_AGAIN) + break; + } + /* + * No nonlinear handling: being always shared, nonlinear vmas + * never contain migration ptes. Decide what to do about this + * limitation to linear when we need rmap_walk() on nonlinear. + */ + spin_unlock(&mapping->i_mmap_lock); + return ret; +} + +int rmap_walk(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + VM_BUG_ON(!PageLocked(page)); + + if (unlikely(PageKsm(page))) + return rmap_walk_ksm(page, rmap_one, arg); + else if (PageAnon(page)) + return rmap_walk_anon(page, rmap_one, arg); + else + return rmap_walk_file(page, rmap_one, arg); +} +#endif /* CONFIG_MIGRATION */ diff --git a/mm/shmem.c b/mm/shmem.c index 356dd99566ec..eef4ebea5158 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -29,7 +29,6 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/swap.h> -#include <linux/ima.h> static struct vfsmount *shm_mnt; @@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt; #include <linux/xattr.h> #include <linux/exportfs.h> +#include <linux/posix_acl.h> #include <linux/generic_acl.h> #include <linux/mman.h> #include <linux/string.h> @@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) error = inode_setattr(inode, attr); #ifdef CONFIG_TMPFS_POSIX_ACL if (!error && (attr->ia_valid & ATTR_MODE)) - error = generic_acl_chmod(inode, &shmem_acl_ops); + error = generic_acl_chmod(inode); #endif if (page) page_cache_release(page); @@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page) goto out; } mutex_unlock(&shmem_swaplist_mutex); -out: return found; /* 0 or 1 or -ENOMEM */ + /* + * Can some race bring us here? We've been holding page lock, + * so I think not; but would rather try again later than BUG() + */ + unlock_page(page); + page_cache_release(page); +out: + return (found < 0) ? 
found : 0; } /* @@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) else inode = NULL; spin_unlock(&info->lock); - swap_duplicate(swap); + swap_shmem_alloc(swap); BUG_ON(page_mapped(page)); page_cache_release(page); /* pagecache ref */ swap_writepage(page, wbc); @@ -1817,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) return error; } } - error = shmem_acl_init(inode, dir); +#ifdef CONFIG_TMPFS_POSIX_ACL + error = generic_acl_init(inode, dir); if (error) { iput(inode); return error; } +#else + error = 0; +#endif if (dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; if (S_ISDIR(mode)) @@ -2036,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = { * filesystem level, though. */ -static size_t shmem_xattr_security_list(struct inode *inode, char *list, +static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, size_t list_len, const char *name, - size_t name_len) + size_t name_len, int handler_flags) { - return security_inode_listsecurity(inode, list, list_len); + return security_inode_listsecurity(dentry->d_inode, list, list_len); } -static int shmem_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int shmem_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) { if (strcmp(name, "") == 0) return -EINVAL; - return xattr_getsecurity(inode, name, buffer, size); + return xattr_getsecurity(dentry->d_inode, name, buffer, size); } -static int shmem_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int shmem_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int handler_flags) { if (strcmp(name, "") == 0) return -EINVAL; - return security_inode_setsecurity(inode, name, value, size, flags); + return security_inode_setsecurity(dentry->d_inode, name, value, + size, flags); } static struct xattr_handler shmem_xattr_security_handler = { @@ -2067,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = { }; static struct xattr_handler *shmem_xattr_handlers[] = { - &shmem_xattr_acl_access_handler, - &shmem_xattr_acl_default_handler, + &generic_acl_access_handler, + &generic_acl_default_handler, &shmem_xattr_security_handler, NULL }; @@ -2447,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .check_acl = shmem_check_acl, + .check_acl = generic_check_acl, #endif }; @@ -2470,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .check_acl = shmem_check_acl, + .check_acl = generic_check_acl, #endif }; @@ -2481,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .check_acl = shmem_check_acl, + .check_acl = generic_check_acl, #endif }; @@ -2619,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags int error; struct file *file; struct inode *inode; - struct dentry *dentry, *root; + struct path path; + struct dentry *root; struct qstr this; if (IS_ERR(shm_mnt)) @@ -2636,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, 
loff_t size, unsigned long flags this.len = strlen(name); this.hash = 0; /* will go */ root = shm_mnt->mnt_root; - dentry = d_alloc(root, &this); - if (!dentry) + path.dentry = d_alloc(root, &this); + if (!path.dentry) goto put_memory; - - error = -ENFILE; - file = get_empty_filp(); - if (!file) - goto put_dentry; + path.mnt = mntget(shm_mnt); error = -ENOSPC; inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) - goto close_file; + goto put_dentry; - d_instantiate(dentry, inode); + d_instantiate(path.dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ - init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, - &shmem_file_operations); - #ifndef CONFIG_MMU error = ramfs_nommu_expand_for_mapping(inode, size); if (error) - goto close_file; + goto put_dentry; #endif - ima_counts_get(file); + + error = -ENFILE; + file = alloc_file(&path, FMODE_WRITE | FMODE_READ, + &shmem_file_operations); + if (!file) + goto put_dentry; + return file; -close_file: - put_filp(file); put_dentry: - dput(dentry); + path_put(&path); put_memory: shmem_unacct_size(flags, size); return ERR_PTR(error); diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c deleted file mode 100644 index df2c87fdae50..000000000000 --- a/mm/shmem_acl.c +++ /dev/null @@ -1,171 +0,0 @@ -/* - * mm/shmem_acl.c - * - * (C) 2005 Andreas Gruenbacher <agruen@suse.de> - * - * This file is released under the GPL. - */ - -#include <linux/fs.h> -#include <linux/shmem_fs.h> -#include <linux/xattr.h> -#include <linux/generic_acl.h> - -/** - * shmem_get_acl - generic_acl_operations->getacl() operation - */ -static struct posix_acl * -shmem_get_acl(struct inode *inode, int type) -{ - struct posix_acl *acl = NULL; - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - acl = posix_acl_dup(inode->i_acl); - break; - - case ACL_TYPE_DEFAULT: - acl = posix_acl_dup(inode->i_default_acl); - break; - } - spin_unlock(&inode->i_lock); - - return acl; -} - -/** - * shmem_set_acl - generic_acl_operations->setacl() operation - */ -static void -shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) -{ - struct posix_acl *free = NULL; - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - free = inode->i_acl; - inode->i_acl = posix_acl_dup(acl); - break; - - case ACL_TYPE_DEFAULT: - free = inode->i_default_acl; - inode->i_default_acl = posix_acl_dup(acl); - break; - } - spin_unlock(&inode->i_lock); - posix_acl_release(free); -} - -struct generic_acl_operations shmem_acl_ops = { - .getacl = shmem_get_acl, - .setacl = shmem_set_acl, -}; - -/** - * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, - * shmem_xattr_acl_access_handler - plumbing code to implement the - * system.posix_acl_access xattr using the generic acl functions. 
- */ - -static size_t -shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) -{ - return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, - list, list_size); -} - -static int -shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, - size); -} - -static int -shmem_set_acl_access(struct inode *inode, const char *name, const void *value, - size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, - size); -} - -struct xattr_handler shmem_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .list = shmem_list_acl_access, - .get = shmem_get_acl_access, - .set = shmem_set_acl_access, -}; - -/** - * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, - * shmem_xattr_acl_default_handler - plumbing code to implement the - * system.posix_acl_default xattr using the generic acl functions. - */ - -static size_t -shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) -{ - return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, - list, list_size); -} - -static int -shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, - size); -} - -static int -shmem_set_acl_default(struct inode *inode, const char *name, const void *value, - size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, - size); -} - -struct xattr_handler shmem_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .list = shmem_list_acl_default, - .get = shmem_get_acl_default, - .set = shmem_set_acl_default, -}; - -/** - * shmem_acl_init - Inizialize the acl(s) of a new inode - */ -int -shmem_acl_init(struct inode *inode, struct inode *dir) -{ - return generic_acl_init(inode, dir, &shmem_acl_ops); -} - -/** - * shmem_check_acl - check_acl() callback for generic_permission() - */ -int -shmem_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); - - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - return -EAGAIN; -} diff --git a/mm/slab.c b/mm/slab.c index 7dfa481c96ba..bac0f4fcc216 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING size_t slab_buffer_size(struct kmem_cache *cachep) { return cachep->buffer_size; @@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = { #define BAD_ALIEN_MAGIC 0x01020304ul +/* + * chicken and egg problem: delay the per-cpu array allocation + * until the general caches are up. 
+ */ +static enum { + NONE, + PARTIAL_AC, + PARTIAL_L3, + EARLY, + FULL +} g_cpucache_up; + +/* + * used by boot code to determine if it can use slab based allocator + */ +int slab_is_available(void) +{ + return g_cpucache_up >= EARLY; +} + #ifdef CONFIG_LOCKDEP /* @@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = { static struct lock_class_key on_slab_l3_key; static struct lock_class_key on_slab_alc_key; -static inline void init_lock_keys(void) - +static void init_node_lock_keys(int q) { - int q; struct cache_sizes *s = malloc_sizes; - while (s->cs_size != ULONG_MAX) { - for_each_node(q) { - struct array_cache **alc; - int r; - struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; - if (!l3 || OFF_SLAB(s->cs_cachep)) - continue; - lockdep_set_class(&l3->list_lock, &on_slab_l3_key); - alc = l3->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. - * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - continue; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&alc[r]->lock, - &on_slab_alc_key); - } + if (g_cpucache_up != FULL) + return; + + for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { + struct array_cache **alc; + struct kmem_list3 *l3; + int r; + + l3 = s->cs_cachep->nodelists[q]; + if (!l3 || OFF_SLAB(s->cs_cachep)) + continue; + lockdep_set_class(&l3->list_lock, &on_slab_l3_key); + alc = l3->alien; + /* + * FIXME: This check for BAD_ALIEN_MAGIC + * should go away when common slab code is taught to + * work even without alien caches. + * Currently, non NUMA code returns BAD_ALIEN_MAGIC + * for alloc_alien_cache, + */ + if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) + continue; + for_each_node(r) { + if (alc[r]) + lockdep_set_class(&alc[r]->lock, + &on_slab_alc_key); } - s++; } } + +static inline void init_lock_keys(void) +{ + int node; + + for_each_node(node) + init_node_lock_keys(node); +} #else +static void init_node_lock_keys(int q) +{ +} + static inline void init_lock_keys(void) { } @@ -665,27 +697,7 @@ static inline void init_lock_keys(void) static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; -/* - * chicken and egg problem: delay the per-cpu array allocation - * until the general caches are up. - */ -static enum { - NONE, - PARTIAL_AC, - PARTIAL_L3, - EARLY, - FULL -} g_cpucache_up; - -/* - * used by boot code to determine if it can use slab based allocator - */ -int slab_is_available(void) -{ - return g_cpucache_up >= EARLY; -} - -static DEFINE_PER_CPU(struct delayed_work, reap_work); +static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { @@ -826,7 +838,7 @@ __setup("noaliencache", noaliencache_setup); * objects freed on different nodes from which they were allocated) and the * flushing of remote pcps by calling drain_node_pages. 
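slab_is_available() and the g_cpucache_up states were moved up so init_node_lock_keys() can bail out until the allocator is fully up; outside slab.c the same predicate is typically what early boot callers consult before trusting kmalloc(). A hedged sketch of that common boot-time pattern (the table name and its use are hypothetical):

#include <linux/slab.h>
#include <linux/bootmem.h>

static void *alloc_early_table(unsigned long size)
{
    /* Before the slab caches exist, fall back to the boot allocator. */
    if (slab_is_available())
        return kzalloc(size, GFP_KERNEL);
    return alloc_bootmem(size);
}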
*/ -static DEFINE_PER_CPU(unsigned long, reap_node); +static DEFINE_PER_CPU(unsigned long, slab_reap_node); static void init_reap_node(int cpu) { @@ -836,17 +848,17 @@ static void init_reap_node(int cpu) if (node == MAX_NUMNODES) node = first_node(node_online_map); - per_cpu(reap_node, cpu) = node; + per_cpu(slab_reap_node, cpu) = node; } static void next_reap_node(void) { - int node = __get_cpu_var(reap_node); + int node = __get_cpu_var(slab_reap_node); node = next_node(node, node_online_map); if (unlikely(node >= MAX_NUMNODES)) node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; + __get_cpu_var(slab_reap_node) = node; } #else @@ -863,7 +875,7 @@ static void next_reap_node(void) */ static void __cpuinit start_cpu_timer(int cpu) { - struct delayed_work *reap_work = &per_cpu(reap_work, cpu); + struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); /* * When this gets called from do_initcalls via cpucache_init(), @@ -923,7 +935,6 @@ static int transfer_objects(struct array_cache *to, from->avail -= nr; to->avail += nr; - to->touched = 1; return nr; } @@ -971,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) if (limit > 1) limit = 12; - ac_ptr = kmalloc_node(memsize, gfp, node); + ac_ptr = kzalloc_node(memsize, gfp, node); if (ac_ptr) { for_each_node(i) { - if (i == node || !node_online(i)) { - ac_ptr[i] = NULL; + if (i == node || !node_online(i)) continue; - } ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); if (!ac_ptr[i]) { for (i--; i >= 0; i--) @@ -1027,7 +1036,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, */ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) { - int node = __get_cpu_var(reap_node); + int node = __get_cpu_var(slab_reap_node); if (l3->alien) { struct array_cache *ac = l3->alien[node]; @@ -1120,7 +1129,7 @@ static void __cpuinit cpuup_canceled(long cpu) if (nc) free_block(cachep, nc->entry, nc->avail, node); - if (!cpus_empty(*mask)) { + if (!cpumask_empty(mask)) { spin_unlock_irq(&l3->list_lock); goto free_array_cache; } @@ -1254,6 +1263,8 @@ static int __cpuinit cpuup_prepare(long cpu) kfree(shared); free_alien_cache(alien); } + init_node_lock_keys(node); + return 0; bad: cpuup_canceled(cpu); @@ -1286,9 +1297,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, * anything expensive but will only modify reap_work * and reschedule the timer. */ - cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); + cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); /* Now the cache_reaper is guaranteed to be not running. */ - per_cpu(reap_work, cpu).work.func = NULL; + per_cpu(slab_reap_work, cpu).work.func = NULL; break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: @@ -2261,9 +2272,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* * Determine if the slab management is 'on' or 'off' slab. * (bootstrapping cannot cope with offslab caches so don't do - * it too early on.) + * it too early on. Always use on-slab management when + * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) + if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && + !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). @@ -2582,8 +2595,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, * kmemleak does not treat the ->s_mem pointer as a reference * to the object. 
Otherwise we will not report the leak. */ - kmemleak_scan_area(slabp, offsetof(struct slab, list), - sizeof(struct list_head), local_flags); + kmemleak_scan_area(&slabp->list, sizeof(struct list_head), + local_flags); if (!slabp) return NULL; } else { @@ -2947,8 +2960,10 @@ retry: spin_lock(&l3->list_lock); /* See if we can refill from the shared array */ - if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) + if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { + l3->shared->touched = 1; goto alloc_done; + } while (batchcount > 0) { struct list_head *entry; @@ -3085,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) if (cachep == &cache_cache) return false; - return should_failslab(obj_size(cachep), flags); + return should_failslab(obj_size(cachep), flags, cachep->flags); } static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) @@ -3103,13 +3118,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) } else { STATS_INC_ALLOCMISS(cachep); objp = cache_alloc_refill(cachep, flags); + /* + * the 'ac' may be updated by cache_alloc_refill(), + * and kmemleak_erase() requires its correct value. + */ + ac = cpu_cache_get(cachep); } /* * To avoid a false negative, if an object that is in one of the * per-CPU caches is leaked, we need to make sure kmemleak doesn't * treat the array pointers as a reference to the object. */ - kmemleak_erase(&ac->entry[ac->avail]); + if (objp) + kmemleak_erase(&ac->entry[ac->avail]); return objp; } @@ -3306,7 +3327,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); - if (unlikely(nodeid == -1)) + if (nodeid == -1) nodeid = numa_node_id(); if (unlikely(!cachep->nodelists[nodeid])) { @@ -3558,7 +3579,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) { return __cache_alloc(cachep, flags, __builtin_return_address(0)); @@ -3581,21 +3602,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_notrace); */ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) { - unsigned long addr = (unsigned long)ptr; - unsigned long min_addr = PAGE_OFFSET; - unsigned long align_mask = BYTES_PER_WORD - 1; unsigned long size = cachep->buffer_size; struct page *page; - if (unlikely(addr < min_addr)) - goto out; - if (unlikely(addr > (unsigned long)high_memory - size)) - goto out; - if (unlikely(addr & align_mask)) - goto out; - if (unlikely(!kern_addr_valid(addr))) - goto out; - if (unlikely(!kern_addr_valid(addr + size - 1))) + if (unlikely(!kern_ptr_validate(ptr, size))) goto out; page = virt_to_page(ptr); if (unlikely(!PageSlab(page))) @@ -3621,7 +3631,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, gfp_t flags, int nodeid) @@ -3649,7 +3659,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) return ret; } -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, @@ -3669,7 +3679,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return 
__do_kmalloc_node(size, flags, node, NULL); } EXPORT_SYMBOL(__kmalloc_node); -#endif /* CONFIG_DEBUG_SLAB */ +#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ #endif /* CONFIG_NUMA */ /** @@ -3701,7 +3711,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, } -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, __builtin_return_address(0)); diff --git a/mm/slub.c b/mm/slub.c index 4996fc719552..7d6c8b1ccf63 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -151,7 +151,8 @@ * Set of flags that will prevent slab merging */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_FAILSLAB) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ SLAB_CACHE_DMA | SLAB_NOTRACK) @@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) #endif -static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) +static inline void stat(struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - c->stat[si]++; + __this_cpu_inc(s->cpu_slab->stat[si]); #endif } @@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif } -static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) -{ -#ifdef CONFIG_SMP - return s->cpu_slab[cpu]; -#else - return &s->cpu_slab; -#endif -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; } -/* - * Slow version of get and set free pointer. - * - * This version requires touching the cache lines of kmem_cache which - * we avoid to do in the fast alloc free paths. There we obtain the offset - * from the page struct. - */ static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str) case 't': slub_debug |= SLAB_TRACE; break; + case 'a': + slub_debug |= SLAB_FAILSLAB; + break; default: printk(KERN_ERR "slub_debug option '%c' " "unknown. skipped\n", *str); @@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) return NULL; - stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + stat(s, ORDER_FALLBACK); } if (kmemcheck_enabled @@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); __ClearPageSlubFrozen(page); if (page->inuse) { if (page->freelist) { add_partial(n, page, tail); - stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + stat(s, tail ? 
DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { - stat(c, DEACTIVATE_FULL); + stat(s, DEACTIVATE_FULL); if (SLABDEBUG && PageSlubDebug(page) && (s->flags & SLAB_STORE_USER)) add_full(n, page); } slab_unlock(page); } else { - stat(c, DEACTIVATE_EMPTY); + stat(s, DEACTIVATE_EMPTY); if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order @@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { slab_unlock(page); - stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); + stat(s, FREE_SLAB); discard_slab(s, page); } } @@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) int tail = 1; if (page->freelist) - stat(c, DEACTIVATE_REMOTE_FREES); + stat(s, DEACTIVATE_REMOTE_FREES); /* * Merge cpu freelist into slab freelist. Typically we get here * because both freelists are empty. So this is unlikely @@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) /* Retrieve object from cpu_freelist */ object = c->freelist; - c->freelist = c->freelist[c->offset]; + c->freelist = get_freepointer(s, c->freelist); /* And put onto the regular freelist */ - object[c->offset] = page->freelist; + set_freepointer(s, object, page->freelist); page->freelist = object; page->inuse--; } @@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - stat(c, CPUSLAB_FLUSH); + stat(s, CPUSLAB_FLUSH); slab_lock(c->page); deactivate_slab(s, c); } @@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (likely(c && c->page)) flush_slab(s, c); @@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(!node_match(c, node))) goto another_slab; - stat(c, ALLOC_REFILL); + stat(s, ALLOC_REFILL); load_freelist: object = c->page->freelist; @@ -1644,13 +1631,13 @@ load_freelist: if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) goto debug; - c->freelist = object[c->offset]; + c->freelist = get_freepointer(s, object); c->page->inuse = c->page->objects; c->page->freelist = NULL; c->node = page_to_nid(c->page); unlock_out: slab_unlock(c->page); - stat(c, ALLOC_SLOWPATH); + stat(s, ALLOC_SLOWPATH); return object; another_slab: @@ -1660,7 +1647,7 @@ new_slab: new = get_partial(s, gfpflags, node); if (new) { c->page = new; - stat(c, ALLOC_FROM_PARTIAL); + stat(s, ALLOC_FROM_PARTIAL); goto load_freelist; } @@ -1673,8 +1660,8 @@ new_slab: local_irq_disable(); if (new) { - c = get_cpu_slab(s, smp_processor_id()); - stat(c, ALLOC_SLAB); + c = __this_cpu_ptr(s->cpu_slab); + stat(s, ALLOC_SLAB); if (c->page) flush_slab(s, c); slab_lock(new); @@ -1690,7 +1677,7 @@ debug: goto another_slab; c->page->inuse++; - c->page->freelist = object[c->offset]; + c->page->freelist = get_freepointer(s, object); c->node = -1; goto unlock_out; } @@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void **object; struct kmem_cache_cpu *c; unsigned long flags; - unsigned int objsize; gfpflags &= gfp_allowed_mask; lockdep_trace_alloc(gfpflags); might_sleep_if(gfpflags & __GFP_WAIT); - if (should_failslab(s->objsize, gfpflags)) + if 
(should_failslab(s->objsize, gfpflags, s->flags)) return NULL; local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); - objsize = c->objsize; - if (unlikely(!c->freelist || !node_match(c, node))) + c = __this_cpu_ptr(s->cpu_slab); + object = c->freelist; + if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { - object = c->freelist; - c->freelist = object[c->offset]; - stat(c, ALLOC_FASTPATH); + c->freelist = get_freepointer(s, object); + stat(s, ALLOC_FASTPATH); } local_irq_restore(flags); - if (unlikely((gfpflags & __GFP_ZERO) && object)) - memset(object, 0, objsize); + if (unlikely(gfpflags & __GFP_ZERO) && object) + memset(object, 0, s->objsize); - kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); - kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); + kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); + kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); return object; } @@ -1754,7 +1739,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) } EXPORT_SYMBOL(kmem_cache_alloc); -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) { return slab_alloc(s, gfpflags, -1, _RET_IP_); @@ -1775,7 +1760,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) EXPORT_SYMBOL(kmem_cache_alloc_node); #endif -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, gfp_t gfpflags, int node) @@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, unsigned long addr, unsigned int offset) + void *x, unsigned long addr) { void *prior; void **object = (void *)x; - struct kmem_cache_cpu *c; - c = get_cpu_slab(s, raw_smp_processor_id()); - stat(c, FREE_SLOWPATH); + stat(s, FREE_SLOWPATH); slab_lock(page); if (unlikely(SLABDEBUG && PageSlubDebug(page))) goto debug; checks_ok: - prior = object[offset] = page->freelist; + prior = page->freelist; + set_freepointer(s, object, prior); page->freelist = object; page->inuse--; if (unlikely(PageSlubFrozen(page))) { - stat(c, FREE_FROZEN); + stat(s, FREE_FROZEN); goto out_unlock; } @@ -1826,7 +1810,7 @@ checks_ok: */ if (unlikely(!prior)) { add_partial(get_node(s, page_to_nid(page)), page, 1); - stat(c, FREE_ADD_PARTIAL); + stat(s, FREE_ADD_PARTIAL); } out_unlock: @@ -1839,10 +1823,10 @@ slab_empty: * Slab still on the partial list. 
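The SLUB fast paths above now always go through get_freepointer()/set_freepointer(): the pointer to the next free object is stored inside each free object itself, at offset s->offset. A self-contained user-space model of that embedded freelist (sizes and names invented):

#include <stdio.h>
#include <stddef.h>

#define NOBJ     4
#define OBJ_SIZE 32

static unsigned char slab_page[NOBJ][OBJ_SIZE];
static void *freelist;       /* head of the embedded free list */
static size_t fp_offset;     /* plays the role of kmem_cache->offset */

static void *get_freepointer(void *object)
{
    return *(void **)((char *)object + fp_offset);
}

static void set_freepointer(void *object, void *fp)
{
    *(void **)((char *)object + fp_offset) = fp;
}

int main(void)
{
    void *obj;
    int i;

    for (i = 0; i < NOBJ; i++) {          /* build the initial free list */
        set_freepointer(slab_page[i], freelist);
        freelist = slab_page[i];
    }
    obj = freelist;                       /* allocate: pop the head */
    freelist = get_freepointer(obj);
    set_freepointer(obj, freelist);       /* free: push it back */
    freelist = obj;
    printf("head of freelist is again %p\n", freelist);
    return 0;
}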
*/ remove_partial(s, page); - stat(c, FREE_REMOVE_PARTIAL); + stat(s, FREE_REMOVE_PARTIAL); } slab_unlock(page); - stat(c, FREE_SLAB); + stat(s, FREE_SLAB); discard_slab(s, page); return; @@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s, kmemleak_free_recursive(x, s->flags); local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); - kmemcheck_slab_free(s, object, c->objsize); - debug_check_no_locks_freed(object, c->objsize); + c = __this_cpu_ptr(s->cpu_slab); + kmemcheck_slab_free(s, object, s->objsize); + debug_check_no_locks_freed(object, s->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, c->objsize); + debug_check_no_obj_freed(object, s->objsize); if (likely(page == c->page && c->node >= 0)) { - object[c->offset] = c->freelist; + set_freepointer(s, object, c->freelist); c->freelist = object; - stat(c, FREE_FASTPATH); + stat(s, FREE_FASTPATH); } else - __slab_free(s, page, x, addr, c->offset); + __slab_free(s, page, x, addr); local_irq_restore(flags); } @@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } -static void init_kmem_cache_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ - c->page = NULL; - c->freelist = NULL; - c->node = 0; - c->offset = s->offset / sizeof(void *); - c->objsize = s->objsize; -#ifdef CONFIG_SLUB_STATS - memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); -#endif -} - static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { @@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) #endif } -#ifdef CONFIG_SMP -/* - * Per cpu array for per cpu structures. - * - * The per cpu array places all kmem_cache_cpu structures from one processor - * close together meaning that it becomes possible that multiple per cpu - * structures are contained in one cacheline. This may be particularly - * beneficial for the kmalloc caches. - * - * A desktop system typically has around 60-80 slabs. With 100 here we are - * likely able to get per cpu structures for all caches from the array defined - * here. We must be able to cover all kmalloc caches during bootstrap. - * - * If the per cpu array is exhausted then fall back to kmalloc - * of individual cachelines. No sharing is possible then. 
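With the hand-rolled kmem_cache_cpu pool removed, each cache's per-CPU state comes from the generic percpu allocator and is updated through this_cpu operations, which is also how the reworked stat() helper bumps its counters. A brief hedged sketch of that idiom (the structure is invented; the patch itself uses the raw __this_cpu_inc() variant):

#include <linux/percpu.h>

struct demo_cpu_state {
    unsigned long events;
};

static struct demo_cpu_state __percpu *demo_state;  /* from alloc_percpu() */

static void demo_event(void)
{
    /* Increment this CPU's instance in a single per-cpu operation. */
    this_cpu_inc(demo_state->events);
}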
- */ -#define NR_KMEM_CACHE_CPU 100 - -static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], - kmem_cache_cpu); - -static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); -static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); - -static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, - int cpu, gfp_t flags) -{ - struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); - - if (c) - per_cpu(kmem_cache_cpu_free, cpu) = - (void *)c->freelist; - else { - /* Table overflow: So allocate ourselves */ - c = kmalloc_node( - ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), - flags, cpu_to_node(cpu)); - if (!c) - return NULL; - } - - init_kmem_cache_cpu(s, c); - return c; -} - -static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) -{ - if (c < per_cpu(kmem_cache_cpu, cpu) || - c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { - kfree(c); - return; - } - c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); - per_cpu(kmem_cache_cpu_free, cpu) = c; -} - -static void free_kmem_cache_cpus(struct kmem_cache *s) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c) { - s->cpu_slab[cpu] = NULL; - free_kmem_cache_cpu(c, cpu); - } - } -} - -static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c) - continue; - - c = alloc_kmem_cache_cpu(s, cpu, flags); - if (!c) { - free_kmem_cache_cpus(s); - return 0; - } - s->cpu_slab[cpu] = c; - } - return 1; -} - -/* - * Initialize the per cpu array. - */ -static void init_alloc_cpu_cpu(int cpu) -{ - int i; - - if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) - return; - - for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) - free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); - - cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); -} +static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); -static void __init init_alloc_cpu(void) +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) { - int cpu; - - for_each_online_cpu(cpu) - init_alloc_cpu_cpu(cpu); - } + if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) + /* + * Boot time creation of the kmalloc array. Use static per cpu data + * since the per cpu allocator is not available yet. 
+ */ + s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); + else + s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); -#else -static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} -static inline void init_alloc_cpu(void) {} + if (!s->cpu_slab) + return 0; -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - init_kmem_cache_cpu(s, &s->cpu_slab); return 1; } -#endif #ifdef CONFIG_NUMA /* @@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) int node; int local_node; - if (slab_state >= UP) + if (slab_state >= UP && (s < kmalloc_caches || + s > kmalloc_caches + KMALLOC_CACHES)) local_node = page_to_nid(virt_to_page(s)); else local_node = 0; @@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) return 1; + free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) @@ -2519,6 +2386,9 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object) { struct page *page; + if (!kern_ptr_validate(object, s->size)) + return 0; + page = get_object_page(object); if (!page || s != page->slab) @@ -2609,9 +2479,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - + free_percpu(s->cpu_slab); /* Attempt to free all objects */ - free_kmem_cache_cpus(s); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -2651,7 +2520,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; +struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2741,6 +2610,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) char *text; size_t realsize; unsigned long slabflags; + int i; s = kmalloc_caches_dma[index]; if (s) @@ -2760,7 +2630,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) realsize = kmalloc_caches[index].objsize; text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize); - s = kmalloc(kmem_size, flags & ~SLUB_DMA); + + s = NULL; + for (i = 0; i < KMALLOC_CACHES; i++) + if (!kmalloc_caches[i].size) + break; + + BUG_ON(i >= KMALLOC_CACHES); + s = kmalloc_caches + i; /* * Must defer sysfs creation to a workqueue because we don't know @@ -2772,9 +2649,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) if (slab_state >= SYSFS) slabflags |= __SYSFS_ADD_DEFERRED; - if (!s || !text || !kmem_cache_open(s, flags, text, + if (!text || !kmem_cache_open(s, flags, text, realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { - kfree(s); + s->size = 0; kfree(text); goto unlock_out; } @@ -3086,7 +2963,7 @@ static void slab_mem_offline_callback(void *arg) /* * if n->nr_slabs > 0, slabs still exist on the node * that is going down. We were unable to free them, - * and offline_pages() function shoudn't call this + * and offline_pages() function shouldn't call this * callback. So, we must fail. 
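/*
 * Illustrative sketch, not from the patch: the boot-time kmalloc caches
 * live in a static array, so alloc_kmem_cache_cpus() above can recognise
 * them with a pointer-range test and index a static per-cpu area instead
 * of calling alloc_percpu().  The types, the KMALLOC_CACHES value and the
 * slot array here are invented for the demo.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define KMALLOC_CACHES 32			/* assumed size, for the demo only */

struct cache { int size; };

static struct cache kmalloc_caches[KMALLOC_CACHES];
static int kmalloc_percpu_slot[KMALLOC_CACHES];	/* stand-in for the static per-cpu area */

static bool is_boot_cache(struct cache *s)
{
	uintptr_t p = (uintptr_t)s;

	return p >= (uintptr_t)kmalloc_caches &&
	       p < (uintptr_t)(kmalloc_caches + KMALLOC_CACHES);
}

int main(void)
{
	struct cache other;
	struct cache *boot = &kmalloc_caches[5];

	assert(is_boot_cache(boot));
	assert(!is_boot_cache(&other));

	/* a boot cache picks its slot by index, mirroring
	 * kmalloc_percpu + (s - kmalloc_caches) in the patch */
	assert(&kmalloc_percpu_slot[boot - kmalloc_caches] == &kmalloc_percpu_slot[5]);
	return 0;
}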
*/ BUG_ON(slabs_node(s, offline_node)); @@ -3176,8 +3053,6 @@ void __init kmem_cache_init(void) int i; int caches = 0; - init_alloc_cpu(); - #ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the @@ -3261,8 +3136,10 @@ void __init kmem_cache_init(void) #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); - kmem_size = offsetof(struct kmem_cache, cpu_slab) + - nr_cpu_ids * sizeof(struct kmem_cache_cpu *); +#endif +#ifdef CONFIG_NUMA + kmem_size = offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *); #else kmem_size = sizeof(struct kmem_cache); #endif @@ -3351,22 +3228,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { - int cpu; - s->refcount++; /* * Adjust the object sizes so that we clear * the complete object on kzalloc. */ s->objsize = max(s->objsize, (int)size); - - /* - * And then we need to update the object size in the - * per cpu structures - */ - for_each_online_cpu(cpu) - get_cpu_slab(s, cpu)->objsize = s->objsize; - s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); @@ -3420,29 +3287,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long flags; switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - init_alloc_cpu_cpu(cpu); - down_read(&slub_lock); - list_for_each_entry(s, &slab_caches, list) - s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, - GFP_KERNEL); - up_read(&slub_lock); - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: down_read(&slub_lock); list_for_each_entry(s, &slab_caches, list) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); - free_kmem_cache_cpu(c, cpu); - s->cpu_slab[cpu] = NULL; } up_read(&slub_lock); break; @@ -3928,7 +3781,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, int cpu; for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (!c || c->node < 0) continue; @@ -4171,6 +4024,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, } SLAB_ATTR(trace); +#ifdef CONFIG_FAILSLAB +static ssize_t failslab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); +} + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_FAILSLAB; + if (buf[0] == '1') + s->flags |= SLAB_FAILSLAB; + return length; +} +SLAB_ATTR(failslab); +#endif + static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); @@ -4353,7 +4223,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return -ENOMEM; for_each_online_cpu(cpu) { - unsigned x = get_cpu_slab(s, cpu)->stat[si]; + unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; data[cpu] = x; sum += x; @@ -4371,12 +4241,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return len + sprintf(buf + len, "\n"); } +static void clear_stat(struct kmem_cache *s, enum stat_item si) +{ + int cpu; + + for_each_online_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; +} + #define STAT_ATTR(si, text) \ static ssize_t text##_show(struct kmem_cache *s, char *buf) \ { \ return show_stat(s, buf, si); \ } \ 
-SLAB_ATTR_RO(text); \ +static ssize_t text##_store(struct kmem_cache *s, \ + const char *buf, size_t length) \ +{ \ + if (buf[0] != '0') \ + return -EINVAL; \ + clear_stat(s, si); \ + return length; \ +} \ +SLAB_ATTR(text); \ STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); @@ -4451,6 +4337,10 @@ static struct attribute *slab_attrs[] = { &deactivate_remote_frees_attr.attr, &order_fallback_attr.attr, #endif +#ifdef CONFIG_FAILSLAB + &failslab_attr.attr, +#endif + NULL }; @@ -4503,7 +4393,7 @@ static void kmem_cache_release(struct kobject *kobj) kfree(s); } -static struct sysfs_ops slab_sysfs_ops = { +static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, }; @@ -4522,7 +4412,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) return 0; } -static struct kset_uevent_ops slab_uevent_ops = { +static const struct kset_uevent_ops slab_uevent_ops = { .filter = uevent_filter, }; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d9714bdcb4a3..aa33fd67fa41 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -22,6 +22,7 @@ #include <linux/bootmem.h> #include <linux/highmem.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/sched.h> @@ -40,9 +41,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); + return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); } +static void *vmemmap_buf; +static void *vmemmap_buf_end; void * __meminit vmemmap_alloc_block(unsigned long size, int node) { @@ -64,6 +67,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) __pa(MAX_DMA_ADDRESS)); } +/* need to make sure size is all the same during early stage */ +void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) +{ + void *ptr; + + if (!vmemmap_buf) + return vmemmap_alloc_block(size, node); + + /* take the from buf */ + ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); + if (ptr + size > vmemmap_buf_end) + return vmemmap_alloc_block(size, node); + + vmemmap_buf = ptr + size; + + return ptr; +} + void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { @@ -80,7 +101,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { pte_t entry; - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); if (!p) return NULL; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); @@ -163,3 +184,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) return map; } + +void __init sparse_mem_maps_populate_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + unsigned long pnum; + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + void *vmemmap_buf_start; + + size = ALIGN(size, PMD_SIZE); + vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, + PMD_SIZE, __pa(MAX_DMA_ADDRESS)); + + if (vmemmap_buf_start) { + vmemmap_buf = vmemmap_buf_start; + vmemmap_buf_end = vmemmap_buf_start + size * map_count; + } + + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + + map_map[pnum] = 
sparse_mem_map_populate(pnum, nodeid); + if (map_map[pnum]) + continue; + ms = __nr_to_section(pnum); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + } + + if (vmemmap_buf_start) { + /* need to free left buf */ +#ifdef CONFIG_NO_BOOTMEM + free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); + if (vmemmap_buf_start < vmemmap_buf) { + char name[15]; + + snprintf(name, sizeof(name), "MEMMAP %d", nodeid); + reserve_early_without_check(__pa(vmemmap_buf_start), + __pa(vmemmap_buf), name); + } +#else + free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); +#endif + vmemmap_buf = NULL; + vmemmap_buf_end = NULL; + } +} diff --git a/mm/sparse.c b/mm/sparse.c index 6ce4aab69e99..dc0cc4d43ff3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -2,6 +2,7 @@ * sparse memory mappings. */ #include <linux/mm.h> +#include <linux/slab.h> #include <linux/mmzone.h> #include <linux/bootmem.h> #include <linux/highmem.h> @@ -271,7 +272,8 @@ static unsigned long *__kmalloc_section_usemap(void) #ifdef CONFIG_MEMORY_HOTREMOVE static unsigned long * __init -sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long count) { unsigned long section_nr; @@ -286,7 +288,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) * this problem. */ section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); - return alloc_bootmem_section(usemap_size(), section_nr); + return alloc_bootmem_section(usemap_size() * count, section_nr); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -329,7 +331,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #else static unsigned long * __init -sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long count) { return NULL; } @@ -339,27 +342,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) +static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long usemap_count, int nodeid) { - unsigned long *usemap; - struct mem_section *ms = __nr_to_section(pnum); - int nid = sparse_early_nid(ms); - - usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); - if (usemap) - return usemap; + void *usemap; + unsigned long pnum; + int size = usemap_size(); - usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), + usemap_count); if (usemap) { - check_usemap_section_nr(nid, usemap); - return usemap; + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + } + return; } - /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ - nid = 0; + usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); + if (usemap) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + check_usemap_section_nr(nodeid, usemap_map[pnum]); + } + return; + } printk(KERN_WARNING "%s: allocation failed\n", __func__); - return NULL; } #ifndef CONFIG_SPARSEMEM_VMEMMAP @@ 
-375,8 +391,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); return map; } +void __init sparse_mem_maps_populate_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + void *map; + unsigned long pnum; + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + + map = alloc_remap(nodeid, size * map_count); + if (map) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = map; + map += size; + } + return; + } + + size = PAGE_ALIGN(size); + map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); + if (map) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = map; + map += size; + } + return; + } + + /* fallback */ + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); + if (map_map[pnum]) + continue; + ms = __nr_to_section(pnum); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + } +} #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER +static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, + map_count, nodeid); +} +#else static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; @@ -392,10 +465,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) ms->section_mem_map = 0; return NULL; } +#endif void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) { } + /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. 
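/*
 * Illustrative userspace sketch, not from the patch: the *_alloc_node()
 * helpers above grab one buffer sized for every present section of a
 * node and then hand each present section its slice, instead of doing
 * one bootmem allocation per section.  Section bookkeeping here is a
 * plain bool array and the sizes are invented; the kernel walks
 * mem_section state with present_section_nr().
 */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

#define SECTIONS 8
#define MAP_SIZE 128		/* bytes of per-section map, made up for the demo */

int main(void)
{
	bool present[SECTIONS] = { true, false, true, true, false, false, true, false };
	char *map[SECTIONS] = { 0 };
	size_t count = 0;

	for (int i = 0; i < SECTIONS; i++)
		count += present[i];

	char *base = malloc(MAP_SIZE * count);	/* one allocation for the whole node */
	assert(base);

	char *slice = base;
	for (int i = 0; i < SECTIONS; i++) {
		if (!present[i])
			continue;
		map[i] = slice;			/* this section's part of the big buffer */
		slice += MAP_SIZE;
	}

	assert(map[0] == base && !map[1] && map[2] && map[6]);
	free(base);
	return 0;
}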
@@ -407,6 +482,14 @@ void __init sparse_init(void) unsigned long *usemap; unsigned long **usemap_map; int size; + int nodeid_begin = 0; + unsigned long pnum_begin = 0; + unsigned long usemap_count; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + unsigned long map_count; + int size2; + struct page **map_map; +#endif /* * map is using big page (aka 2M in x86 64 bit) @@ -425,10 +508,81 @@ void __init sparse_init(void) panic("can not allocate usemap_map\n"); for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + if (!present_section_nr(pnum)) continue; - usemap_map[pnum] = sparse_early_usemap_alloc(pnum); + ms = __nr_to_section(pnum); + nodeid_begin = sparse_early_nid(ms); + pnum_begin = pnum; + break; } + usemap_count = 1; + for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + int nodeid; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid = sparse_early_nid(ms); + if (nodeid == nodeid_begin) { + usemap_count++; + continue; + } + /* ok, we need to take cake of from pnum_begin to pnum - 1*/ + sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, + usemap_count, nodeid_begin); + /* new start, update count etc*/ + nodeid_begin = nodeid; + pnum_begin = pnum; + usemap_count = 1; + } + /* ok, last chunk */ + sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, + usemap_count, nodeid_begin); + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + size2 = sizeof(struct page *) * NR_MEM_SECTIONS; + map_map = alloc_bootmem(size2); + if (!map_map) + panic("can not allocate map_map\n"); + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid_begin = sparse_early_nid(ms); + pnum_begin = pnum; + break; + } + map_count = 1; + for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + int nodeid; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid = sparse_early_nid(ms); + if (nodeid == nodeid_begin) { + map_count++; + continue; + } + /* ok, we need to take cake of from pnum_begin to pnum - 1*/ + sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, + map_count, nodeid_begin); + /* new start, update count etc*/ + nodeid_begin = nodeid; + pnum_begin = pnum; + map_count = 1; + } + /* ok, last chunk */ + sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, + map_count, nodeid_begin); +#endif for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { if (!present_section_nr(pnum)) @@ -438,7 +592,11 @@ void __init sparse_init(void) if (!usemap) continue; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + map = map_map[pnum]; +#else map = sparse_early_mem_map_alloc(pnum); +#endif if (!map) continue; @@ -448,6 +606,9 @@ void __init sparse_init(void) vmemmap_populate_print_last(); +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + free_bootmem(__pa(map_map), size2); +#endif free_bootmem(__pa(usemap_map), size); } diff --git a/mm/swap.c b/mm/swap.c index 308e57d8d7ed..7cd60bf0a972 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,6 +30,7 @@ #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> +#include <linux/gfp.h> #include "internal.h" @@ -55,7 +56,7 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } - free_hot_page(page); + free_hot_cold_page(page, 0); } static void put_compound_page(struct page 
*page) diff --git a/mm/swap_state.c b/mm/swap_state.c index 6d1daeb1cb4a..e10f5833167f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -8,6 +8,7 @@ */ #include <linux/module.h> #include <linux/mm.h> +#include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/swapops.h> diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c590eef7912..6cd0a8f90dc7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -22,6 +22,7 @@ #include <linux/seq_file.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/backing-dev.h> @@ -35,11 +36,15 @@ #include <linux/swapops.h> #include <linux/page_cgroup.h> +static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); +static void free_swap_count_continuations(struct swap_info_struct *); +static sector_t map_swap_entry(swp_entry_t, struct block_device**); + static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; long total_swap_pages; -static int swap_overflow; static int least_priority; static const char Bad_file[] = "Bad swap file entry "; @@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry "; static struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct swap_info[MAX_SWAPFILES]; +static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); -/* For reference count accounting in swap_map */ -/* enum for swap_map[] handling. internal use only */ -enum { - SWAP_MAP = 0, /* ops for reference from swap users */ - SWAP_CACHE, /* ops for reference from swap cache */ -}; - -static inline int swap_count(unsigned short ent) -{ - return ent & SWAP_COUNT_MASK; -} - -static inline bool swap_has_cache(unsigned short ent) -{ - return !!(ent & SWAP_HAS_CACHE); -} - -static inline unsigned short encode_swapmap(int count, bool has_cache) +static inline unsigned char swap_count(unsigned char ent) { - unsigned short ret = count; - - if (has_cache) - return SWAP_HAS_CACHE | ret; - return ret; + return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ } -/* returnes 1 if swap entry is freed */ +/* returns 1 if swap entry is freed */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) { - int type = si - swap_info; - swp_entry_t entry = swp_entry(type, offset); + swp_entry_t entry = swp_entry(si->type, offset); struct page *page; int ret = 0; @@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) down_read(&swap_unplug_sem); entry.val = page_private(page); if (PageSwapCache(page)) { - struct block_device *bdev = swap_info[swp_type(entry)].bdev; + struct block_device *bdev = swap_info[swp_type(entry)]->bdev; struct backing_dev_info *bdi; /* @@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) static int discard_swap(struct swap_info_struct *si) { struct swap_extent *se; + sector_t start_block; + sector_t nr_blocks; int err = 0; - list_for_each_entry(se, &si->extent_list, list) { - sector_t start_block = se->start_block << (PAGE_SHIFT - 9); - sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); + /* Do not discard the swap header page! 
*/ + se = &si->first_swap_extent; + start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); + nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); + if (nr_blocks) { + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); + if (err) + return err; + cond_resched(); + } - if (se->start_page == 0) { - /* Do not discard the swap header page! */ - start_block += 1 << (PAGE_SHIFT - 9); - nr_blocks -= 1 << (PAGE_SHIFT - 9); - if (!nr_blocks) - continue; - } + list_for_each_entry(se, &si->first_swap_extent.list, list) { + start_block = se->start_block << (PAGE_SHIFT - 9); + nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, - DISCARD_FL_BARRIER); + nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); if (err) break; @@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, - DISCARD_FL_BARRIER)) + nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) break; } lh = se->list.next; - if (lh == &si->extent_list) - lh = lh->next; se = list_entry(lh, struct swap_extent, list); } } @@ -223,7 +208,7 @@ static int wait_for_discard(void *word) #define LATENCY_LIMIT 256 static inline unsigned long scan_swap_map(struct swap_info_struct *si, - int cache) + unsigned char usage) { unsigned long offset; unsigned long scan_base; @@ -354,10 +339,7 @@ checks: si->lowest_bit = si->max; si->highest_bit = 0; } - if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ - si->swap_map[offset] = encode_swapmap(0, true); - else /* at suspend */ - si->swap_map[offset] = encode_swapmap(1, false); + si->swap_map[offset] = usage; si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void) nr_swap_pages--; for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { - si = swap_info + type; + si = swap_info[type]; next = si->next; if (next < 0 || - (!wrapped && si->prio != swap_info[next].prio)) { + (!wrapped && si->prio != swap_info[next]->prio)) { next = swap_list.head; wrapped++; } @@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void) swap_list.next = next; /* This is called for allocating swap entry for cache */ - offset = scan_swap_map(si, SWAP_CACHE); + offset = scan_swap_map(si, SWAP_HAS_CACHE); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type) pgoff_t offset; spin_lock(&swap_lock); - si = swap_info + type; - if (si->flags & SWP_WRITEOK) { + si = swap_info[type]; + if (si && (si->flags & SWP_WRITEOK)) { nr_swap_pages--; /* This is called for allocating swap entry, not cache */ - offset = scan_swap_map(si, SWAP_MAP); + offset = scan_swap_map(si, 1); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type) return (swp_entry_t) {0}; } -static struct swap_info_struct * swap_info_get(swp_entry_t entry) +static struct swap_info_struct *swap_info_get(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; unsigned long offset, type; if (!entry.val) @@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) type = swp_type(entry); if (type >= nr_swapfiles) goto bad_nofile; - p = & swap_info[type]; + p = swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = 
swp_offset(entry); @@ -554,41 +536,56 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, - swp_entry_t ent, int cache) +static unsigned char swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) { - unsigned long offset = swp_offset(ent); - int count = swap_count(p->swap_map[offset]); - bool has_cache; + unsigned long offset = swp_offset(entry); + unsigned char count; + unsigned char has_cache; - has_cache = swap_has_cache(p->swap_map[offset]); + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; - if (cache == SWAP_MAP) { /* dropping usage count of swap */ - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = encode_swapmap(count, has_cache); - } - } else { /* dropping swap cache flag */ + if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); - p->swap_map[offset] = encode_swapmap(count, false); - + has_cache = 0; + } else if (count == SWAP_MAP_SHMEM) { + /* + * Or we could insist on shmem.c using a special + * swap_shmem_free() and free_shmem_swap_and_cache()... + */ + count = 0; + } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { + if (count == COUNT_CONTINUED) { + if (swap_count_continued(p, offset, count)) + count = SWAP_MAP_MAX | COUNT_CONTINUED; + else + count = SWAP_MAP_MAX; + } else + count--; } - /* return code. */ - count = p->swap_map[offset]; + + if (!count) + mem_cgroup_uncharge_swap(entry); + + usage = count | has_cache; + p->swap_map[offset] = usage; + /* free if no reference */ - if (!count) { + if (!usage) { if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = p - swap_info; + if (swap_list.next >= 0 && + p->prio > swap_info[swap_list.next]->prio) + swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; } - if (!swap_count(count)) - mem_cgroup_uncharge_swap(ent); - return count; + + return usage; } /* @@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p, */ void swap_free(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry, SWAP_MAP); + swap_entry_free(p, entry, 1); spin_unlock(&swap_lock); } } @@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry) void swapcache_free(swp_entry_t entry, struct page *page) { struct swap_info_struct *p; - int ret; + unsigned char count; p = swap_info_get(entry); if (p) { - ret = swap_entry_free(p, entry, SWAP_CACHE); - if (page) { - bool swapout; - if (ret) - swapout = true; /* the end of swap out */ - else - swapout = false; /* no more swap users! */ - mem_cgroup_uncharge_swapcache(page, entry, swapout); - } + count = swap_entry_free(p, entry, SWAP_HAS_CACHE); + if (page) + mem_cgroup_uncharge_swapcache(page, entry, count != 0); spin_unlock(&swap_lock); } - return; } /* * How many references to page are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. 
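/*
 * Illustrative decoding of the one-byte swap_map entries handled by
 * swap_entry_free() above: part of the byte carries flags such as
 * SWAP_HAS_CACHE, the rest is the use count (possibly tagged as
 * continued).  The numeric values below are invented for the demo and
 * are not taken from the kernel headers.
 */
#include <assert.h>

#define DEMO_SWAP_HAS_CACHE	0x40	/* swap cache holds a reference */
#define DEMO_COUNT_CONTINUED	0x80	/* more count "digits" live in a continuation page */

static unsigned char demo_swap_count(unsigned char ent)
{
	return ent & ~DEMO_SWAP_HAS_CACHE;	/* may still include DEMO_COUNT_CONTINUED */
}

int main(void)
{
	unsigned char ent = 3 | DEMO_SWAP_HAS_CACHE;	/* three users, page also in swap cache */

	assert(demo_swap_count(ent) == 3);
	assert(ent & DEMO_SWAP_HAS_CACHE);

	/* dropping the cache reference clears only the flag, leaving the
	 * map count untouched, as in swap_entry_free(p, entry, SWAP_HAS_CACHE) */
	ent &= ~DEMO_SWAP_HAS_CACHE;
	assert(demo_swap_count(ent) == 3 && !(ent & DEMO_SWAP_HAS_CACHE));
	return 0;
}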
*/ static inline int page_swapcount(struct page *page) { @@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page) int count; VM_BUG_ON(!PageLocked(page)); + if (unlikely(PageKsm(page))) + return 0; count = page_mapcount(page); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); @@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page) SetPageDirty(page); } } - return count == 1; + return count <= 1; } /* @@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { + if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -729,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry) return p != NULL; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_count_swap_user - count the user of a swap entry + * @ent: the swap entry to be checked + * @pagep: the pointer for the swap cache page of the entry to be stored + * + * Returns the number of the user of the swap entry. The number is valid only + * for swaps of anonymous pages. + * If the entry is found on swap cache, the page is stored to pagep with + * refcount of it being incremented. + */ +int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) +{ + struct page *page; + struct swap_info_struct *p; + int count = 0; + + page = find_get_page(&swapper_space, ent.val); + if (page) + count += page_mapcount(page); + p = swap_info_get(ent); + if (p) { + count += swap_count(p->swap_map[swp_offset(ent)]); + spin_unlock(&swap_lock); + } + + *pagep = page; + return count; +} +#endif + #ifdef CONFIG_HIBERNATION /* * Find the swap type that corresponds to given device (if any). @@ -741,14 +766,14 @@ int free_swap_and_cache(swp_entry_t entry) int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) { struct block_device *bdev = NULL; - int i; + int type; if (device) bdev = bdget(device); spin_lock(&swap_lock); - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *sis = swap_info + i; + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; @@ -758,20 +783,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); - return i; + return type; } if (bdev == sis->bdev) { - struct swap_extent *se; + struct swap_extent *se = &sis->first_swap_extent; - se = list_entry(sis->extent_list.next, - struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); bdput(bdev); - return i; + return type; } } } @@ -783,6 +806,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) } /* + * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev + * corresponding to given index in swap_info (swap type). 
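/*
 * Simplified model, not kernel code: reuse_swap_page() above lets a
 * process write into an anonymous page in place only when it is the
 * sole user, counting both page table mappings and remaining swap
 * references, and never for KSM pages, which are shared by content.
 * The struct and field names are invented for the demo.
 */
#include <assert.h>
#include <stdbool.h>

struct demo_page {
	bool ksm;		/* merged by KSM? */
	int mapcount;		/* page table mappings */
	int swapcount;		/* swap entries still referencing the data */
};

static bool can_reuse(const struct demo_page *p)
{
	if (p->ksm)
		return false;
	return p->mapcount + p->swapcount <= 1;
}

int main(void)
{
	struct demo_page lone = { .ksm = false, .mapcount = 1, .swapcount = 0 };
	struct demo_page cow  = { .ksm = false, .mapcount = 1, .swapcount = 1 };
	struct demo_page ksm  = { .ksm = true,  .mapcount = 1, .swapcount = 0 };

	assert(can_reuse(&lone));	/* sole user: write in place */
	assert(!can_reuse(&cow));	/* another reference exists: copy instead */
	assert(!can_reuse(&ksm));	/* KSM pages are always copied */
	return 0;
}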
+ */ +sector_t swapdev_block(int type, pgoff_t offset) +{ + struct block_device *bdev; + + if ((unsigned int)type >= nr_swapfiles) + return 0; + if (!(swap_info[type]->flags & SWP_WRITEOK)) + return 0; + return map_swap_entry(swp_entry(type, offset), &bdev); +} + +/* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * @@ -792,18 +830,20 @@ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; - if (type < nr_swapfiles) { - spin_lock(&swap_lock); - if (swap_info[type].flags & SWP_WRITEOK) { - n = swap_info[type].pages; + spin_lock(&swap_lock); + if ((unsigned int)type < nr_swapfiles) { + struct swap_info_struct *sis = swap_info[type]; + + if (sis->flags & SWP_WRITEOK) { + n = sis->pages; if (free) - n -= swap_info[type].inuse_pages; + n -= sis->inuse_pages; } - spin_unlock(&swap_lock); } + spin_unlock(&swap_lock); return n; } -#endif +#endif /* CONFIG_HIBERNATION */ /* * No need to decide whether this PTE shares the swap entry with others, @@ -831,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } - inc_mm_counter(vma->vm_mm, anon_rss); + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); @@ -932,7 +973,7 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned long addr, end, next; int ret; - if (page->mapping) { + if (page_anon_vma(page)) { addr = page_address_in_vma(page, vma); if (addr == -EFAULT) return 0; @@ -988,7 +1029,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, { unsigned int max = si->max; unsigned int i = prev; - int count; + unsigned char count; /* * No need for swap_lock here: we're just looking @@ -1024,16 +1065,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, */ static int try_to_unuse(unsigned int type) { - struct swap_info_struct * si = &swap_info[type]; + struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; - unsigned short *swap_map; - unsigned short swcount; + unsigned char *swap_map; + unsigned char swcount; struct page *page; swp_entry_t entry; unsigned int i = 0; int retval = 0; - int reset_overflow = 0; - int shmem; /* * When searching mms for an entry, a good strategy is to @@ -1047,8 +1086,7 @@ static int try_to_unuse(unsigned int type) * together, child after parent. If we race with dup_mmap(), we * prefer to resolve parent before child, lest we miss entries * duplicated after we scanned child: using last mm would invert - * that. Though it's only a serious concern when an overflowed - * swap count is reset from SWAP_MAP_MAX, preventing a rescan. + * that. */ start_mm = &init_mm; atomic_inc(&init_mm.mm_users); @@ -1110,17 +1148,18 @@ static int try_to_unuse(unsigned int type) /* * Remove all references to entry. - * Whenever we reach init_mm, there's no address space - * to search, but use it as a reminder to search shmem. 
*/ - shmem = 0; swcount = *swap_map; - if (swap_count(swcount)) { - if (start_mm == &init_mm) - shmem = shmem_unuse(entry, page); - else - retval = unuse_mm(start_mm, entry, page); + if (swap_count(swcount) == SWAP_MAP_SHMEM) { + retval = shmem_unuse(entry, page); + /* page has already been unlocked and released */ + if (retval < 0) + break; + continue; } + if (swap_count(swcount) && start_mm != &init_mm) + retval = unuse_mm(start_mm, entry, page); + if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; @@ -1131,7 +1170,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (swap_count(*swap_map) && !retval && !shmem && + while (swap_count(*swap_map) && !retval && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -1145,10 +1184,9 @@ static int try_to_unuse(unsigned int type) swcount = *swap_map; if (!swap_count(swcount)) /* any usage ? */ ; - else if (mm == &init_mm) { + else if (mm == &init_mm) set_start_mm = 1; - shmem = shmem_unuse(entry, page); - } else + else retval = unuse_mm(mm, entry, page); if (set_start_mm && *swap_map < swcount) { @@ -1164,13 +1202,6 @@ static int try_to_unuse(unsigned int type) mmput(start_mm); start_mm = new_start_mm; } - if (shmem) { - /* page has already been unlocked and released */ - if (shmem > 0) - continue; - retval = shmem; - break; - } if (retval) { unlock_page(page); page_cache_release(page); @@ -1178,30 +1209,6 @@ static int try_to_unuse(unsigned int type) } /* - * How could swap count reach 0x7ffe ? - * There's no way to repeat a swap page within an mm - * (except in shmem, where it's the shared object which takes - * the reference count)? - * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned - * short is too small....) - * If that's wrong, then we should worry more about - * exit_mmap() and do_munmap() cases described above: - * we might be resetting SWAP_MAP_MAX too early here. - * We know "Undead"s can happen, they're okay, so don't - * report them; but do report if we reset SWAP_MAP_MAX. - */ - /* We might release the lock_page() in unuse_mm(). */ - if (!PageSwapCache(page) || page_private(page) != entry.val) - goto retry; - - if (swap_count(*swap_map) == SWAP_MAP_MAX) { - spin_lock(&swap_lock); - *swap_map = encode_swapmap(0, true); - spin_unlock(&swap_lock); - reset_overflow = 1; - } - - /* * If a reference remains (rare), we would like to leave * the page in the swap cache; but try_to_unmap could * then re-duplicate the entry once we drop page lock, @@ -1213,6 +1220,12 @@ static int try_to_unuse(unsigned int type) * read from disk into another page. Splitting into two * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. + * + * Given how unuse_vma() targets one particular offset + * in an anon_vma, once the anon_vma has been determined, + * this splitting happens to be just what is needed to + * handle where KSM pages have been swapped out: re-reading + * is unnecessarily slow, but we can fix that later on. */ if (swap_count(*swap_map) && PageDirty(page) && PageSwapCache(page)) { @@ -1242,7 +1255,6 @@ static int try_to_unuse(unsigned int type) * mark page dirty so shrink_page_list will preserve it. 
*/ SetPageDirty(page); -retry: unlock_page(page); page_cache_release(page); @@ -1254,10 +1266,6 @@ retry: } mmput(start_mm); - if (reset_overflow) { - printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); - swap_overflow = 0; - } return retval; } @@ -1270,10 +1278,10 @@ retry: static void drain_mmlist(void) { struct list_head *p, *next; - unsigned int i; + unsigned int type; - for (i = 0; i < nr_swapfiles; i++) - if (swap_info[i].inuse_pages) + for (type = 0; type < nr_swapfiles; type++) + if (swap_info[type]->inuse_pages) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) @@ -1283,12 +1291,23 @@ static void drain_mmlist(void) /* * Use this swapdev's extent info to locate the (PAGE_SIZE) block which - * corresponds to page offset `offset'. + * corresponds to page offset for the specified swap entry. + * Note that the type of this function is sector_t, but it returns page offset + * into the bdev, not sector offset. */ -sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) +static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) { - struct swap_extent *se = sis->curr_swap_extent; - struct swap_extent *start_se = se; + struct swap_info_struct *sis; + struct swap_extent *start_se; + struct swap_extent *se; + pgoff_t offset; + + sis = swap_info[swp_type(entry)]; + *bdev = sis->bdev; + + offset = swp_offset(entry); + start_se = sis->curr_swap_extent; + se = start_se; for ( ; ; ) { struct list_head *lh; @@ -1298,40 +1317,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) return se->start_block + (offset - se->start_page); } lh = se->list.next; - if (lh == &sis->extent_list) - lh = lh->next; se = list_entry(lh, struct swap_extent, list); sis->curr_swap_extent = se; BUG_ON(se == start_se); /* It *must* be present */ } } -#ifdef CONFIG_HIBERNATION /* - * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev - * corresponding to given index in swap_info (swap type). + * Returns the page offset into bdev for the specified page's swap entry. */ -sector_t swapdev_block(int swap_type, pgoff_t offset) +sector_t map_swap_page(struct page *page, struct block_device **bdev) { - struct swap_info_struct *sis; - - if (swap_type >= nr_swapfiles) - return 0; - - sis = swap_info + swap_type; - return (sis->flags & SWP_WRITEOK) ? 
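/*
 * Minimal userspace model, not from the patch, of the lookup that
 * map_swap_entry() above performs: each swap extent maps a run of page
 * offsets onto a run of disk blocks, so translation is a range check
 * plus an addition.  The kernel keeps the extents in a list and caches
 * the last hit in curr_swap_extent; a flat array is used here.
 */
#include <assert.h>
#include <stddef.h>

struct extent { unsigned long start_page, nr_pages, start_block; };

static unsigned long map_offset(const struct extent *ext, size_t n, unsigned long offset)
{
	for (size_t i = 0; i < n; i++) {
		const struct extent *se = &ext[i];

		if (offset >= se->start_page && offset < se->start_page + se->nr_pages)
			return se->start_block + (offset - se->start_page);
	}
	return 0;	/* unmapped; the kernel treats this as a bug instead */
}

int main(void)
{
	/* two extents: pages 0..99 at blocks 1000.., pages 100..149 at blocks 5000.. */
	const struct extent ext[] = {
		{ .start_page = 0,   .nr_pages = 100, .start_block = 1000 },
		{ .start_page = 100, .nr_pages = 50,  .start_block = 5000 },
	};

	assert(map_offset(ext, 2, 7) == 1007);
	assert(map_offset(ext, 2, 120) == 5020);
	return 0;
}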
map_swap_page(sis, offset) : 0; + swp_entry_t entry; + entry.val = page_private(page); + return map_swap_entry(entry, bdev); } -#endif /* CONFIG_HIBERNATION */ /* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) { - while (!list_empty(&sis->extent_list)) { + while (!list_empty(&sis->first_swap_extent.list)) { struct swap_extent *se; - se = list_entry(sis->extent_list.next, + se = list_entry(sis->first_swap_extent.list.next, struct swap_extent, list); list_del(&se->list); kfree(se); @@ -1352,8 +1362,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, struct swap_extent *new_se; struct list_head *lh; - lh = sis->extent_list.prev; /* The highest page extent */ - if (lh != &sis->extent_list) { + if (start_page == 0) { + se = &sis->first_swap_extent; + sis->curr_swap_extent = se; + se->start_page = 0; + se->nr_pages = nr_pages; + se->start_block = start_block; + return 1; + } else { + lh = sis->first_swap_extent.list.prev; /* Highest extent */ se = list_entry(lh, struct swap_extent, list); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { @@ -1373,7 +1390,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, new_se->nr_pages = nr_pages; new_se->start_block = start_block; - list_add_tail(&new_se->list, &sis->extent_list); + list_add_tail(&new_se->list, &sis->first_swap_extent.list); return 1; } @@ -1425,7 +1442,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; - goto done; + goto out; } blkbits = inode->i_blkbits; @@ -1496,25 +1513,22 @@ reprobe: sis->max = page_no; sis->pages = page_no - 1; sis->highest_bit = page_no - 1; -done: - sis->curr_swap_extent = list_entry(sis->extent_list.prev, - struct swap_extent, list); - goto out; +out: + return ret; bad_bmap: printk(KERN_ERR "swapon: swapfile has holes\n"); ret = -EINVAL; -out: - return ret; + goto out; } SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { - struct swap_info_struct * p = NULL; - unsigned short *swap_map; + struct swap_info_struct *p = NULL; + unsigned char *swap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; - char * pathname; + char *pathname; int i, type, prev; int err; @@ -1535,8 +1549,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mapping = victim->f_mapping; prev = -1; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = swap_info[type].next) { - p = swap_info + type; + for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { + p = swap_info[type]; if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) break; @@ -1555,18 +1569,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (prev < 0) { + if (prev < 0) swap_list.head = p->next; - } else { - swap_info[prev].next = p->next; - } + else + swap_info[prev]->next = p->next; if (type == swap_list.next) { /* just pick something that's safe... 
*/ swap_list.next = swap_list.head; } if (p->prio < 0) { - for (i = p->next; i >= 0; i = swap_info[i].next) - swap_info[i].prio = p->prio--; + for (i = p->next; i >= 0; i = swap_info[i]->next) + swap_info[i]->prio = p->prio--; least_priority++; } nr_swap_pages -= p->pages; @@ -1584,16 +1597,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->prio < 0) p->prio = --least_priority; prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) break; prev = i; } p->next = i; if (prev < 0) - swap_list.head = swap_list.next = p - swap_info; + swap_list.head = swap_list.next = type; else - swap_info[prev].next = p - swap_info; + swap_info[prev]->next = type; nr_swap_pages += p->pages; total_swap_pages += p->pages; p->flags |= SWP_WRITEOK; @@ -1606,6 +1619,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) up_write(&swap_unplug_sem); destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) + free_swap_count_continuations(p); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); drain_mmlist(); @@ -1653,8 +1669,8 @@ out: /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { - struct swap_info_struct *ptr = swap_info; - int i; + struct swap_info_struct *si; + int type; loff_t l = *pos; mutex_lock(&swapon_mutex); @@ -1662,11 +1678,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (i = 0; i < nr_swapfiles; i++, ptr++) { - if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + for (type = 0; type < nr_swapfiles; type++) { + smp_rmb(); /* read nr_swapfiles before swap_info[type] */ + si = swap_info[type]; + if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) - return ptr; + return si; } return NULL; @@ -1674,21 +1692,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { - struct swap_info_struct *ptr; - struct swap_info_struct *endptr = swap_info + nr_swapfiles; + struct swap_info_struct *si = v; + int type; if (v == SEQ_START_TOKEN) - ptr = swap_info; - else { - ptr = v; - ptr++; - } + type = 0; + else + type = si->type + 1; - for (; ptr < endptr; ptr++) { - if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + for (; type < nr_swapfiles; type++) { + smp_rmb(); /* read nr_swapfiles before swap_info[type] */ + si = swap_info[type]; + if (!(si->flags & SWP_USED) || !si->swap_map) continue; ++*pos; - return ptr; + return si; } return NULL; @@ -1701,24 +1719,24 @@ static void swap_stop(struct seq_file *swap, void *v) static int swap_show(struct seq_file *swap, void *v) { - struct swap_info_struct *ptr = v; + struct swap_info_struct *si = v; struct file *file; int len; - if (ptr == SEQ_START_TOKEN) { + if (si == SEQ_START_TOKEN) { seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); return 0; } - file = ptr->swap_file; + file = si->swap_file; len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 
"partition" : "file\t", - ptr->pages << (PAGE_SHIFT - 10), - ptr->inuse_pages << (PAGE_SHIFT - 10), - ptr->prio); + si->pages << (PAGE_SHIFT - 10), + si->inuse_pages << (PAGE_SHIFT - 10), + si->prio); return 0; } @@ -1765,7 +1783,7 @@ late_initcall(max_swapfiles_check); */ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { - struct swap_info_struct * p; + struct swap_info_struct *p; char *name = NULL; struct block_device *bdev = NULL; struct file *swap_file = NULL; @@ -1773,36 +1791,58 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned int type; int i, prev; int error; - union swap_header *swap_header = NULL; - unsigned int nr_good_pages = 0; + union swap_header *swap_header; + unsigned int nr_good_pages; int nr_extents = 0; sector_t span; - unsigned long maxpages = 1; + unsigned long maxpages; unsigned long swapfilepages; - unsigned short *swap_map = NULL; + unsigned char *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; int did_down = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + spin_lock(&swap_lock); - p = swap_info; - for (type = 0 ; type < nr_swapfiles ; type++,p++) - if (!(p->flags & SWP_USED)) + for (type = 0; type < nr_swapfiles; type++) { + if (!(swap_info[type]->flags & SWP_USED)) break; + } error = -EPERM; if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); + kfree(p); goto out; } - if (type >= nr_swapfiles) - nr_swapfiles = type+1; - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->extent_list); + if (type >= nr_swapfiles) { + p->type = type; + swap_info[type] = p; + /* + * Write swap_info[type] before nr_swapfiles, in case a + * racing procfs swap_start() or swap_next() is reading them. + * (We never shrink nr_swapfiles, we never free this entry.) + */ + smp_wmb(); + nr_swapfiles++; + } else { + kfree(p); + p = swap_info[type]; + /* + * Do not memset this entry: a racing procfs swap_next() + * would be relying on p->type to remain valid. + */ + } + INIT_LIST_HEAD(&p->first_swap_extent.list); p->flags = SWP_USED; p->next = -1; spin_unlock(&swap_lock); + name = getname(specialfile); error = PTR_ERR(name); if (IS_ERR(name)) { @@ -1822,7 +1862,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EBUSY; for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = &swap_info[i]; + struct swap_info_struct *q = swap_info[i]; if (i == type || !q->swap_file) continue; @@ -1897,6 +1937,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->lowest_bit = 1; p->cluster_next = 1; + p->cluster_nr = 0; /* * Find out how many pages are allowed for a single swap @@ -1913,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * swap pte. 
*/ maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; - if (maxpages > swap_header->info.last_page) - maxpages = swap_header->info.last_page; + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + if (maxpages > swap_header->info.last_page) { + maxpages = swap_header->info.last_page + 1; + /* p->max is an unsigned int: don't overflow it */ + if ((unsigned int)maxpages == 0) + maxpages = UINT_MAX; + } p->highest_bit = maxpages - 1; error = -EINVAL; @@ -1932,30 +1977,31 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages * sizeof(short)); + swap_map = vmalloc(maxpages); if (!swap_map) { error = -ENOMEM; goto bad_swap; } - memset(swap_map, 0, maxpages * sizeof(short)); + memset(swap_map, 0, maxpages); + nr_good_pages = maxpages - 1; /* omit header page */ + for (i = 0; i < swap_header->info.nr_badpages; i++) { - int page_nr = swap_header->info.badpages[i]; - if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { + unsigned int page_nr = swap_header->info.badpages[i]; + if (page_nr == 0 || page_nr > swap_header->info.last_page) { error = -EINVAL; goto bad_swap; } - swap_map[page_nr] = SWAP_MAP_BAD; + if (page_nr < maxpages) { + swap_map[page_nr] = SWAP_MAP_BAD; + nr_good_pages--; + } } error = swap_cgroup_swapon(type, maxpages); if (error) goto bad_swap; - nr_good_pages = swap_header->info.last_page - - swap_header->info.nr_badpages - - 1 /* header page */; - if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; @@ -2003,18 +2049,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* insert swap space into swap_list: */ prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) { + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) break; - } prev = i; } p->next = i; - if (prev < 0) { - swap_list.head = swap_list.next = p - swap_info; - } else { - swap_info[prev].next = p - swap_info; - } + if (prev < 0) + swap_list.head = swap_list.next = type; + else + swap_info[prev]->next = type; spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); error = 0; @@ -2051,15 +2095,15 @@ out: void si_swapinfo(struct sysinfo *val) { - unsigned int i; + unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); - for (i = 0; i < nr_swapfiles; i++) { - if (!(swap_info[i].flags & SWP_USED) || - (swap_info[i].flags & SWP_WRITEOK)) - continue; - nr_to_be_unused += swap_info[i].inuse_pages; + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *si = swap_info[type]; + + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) + nr_to_be_unused += si->inuse_pages; } val->freeswap = nr_swap_pages + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; @@ -2069,101 +2113,111 @@ void si_swapinfo(struct sysinfo *val) /* * Verify that a swap entry is valid and increment its swap map count. * - * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as - * "permanent", but will be reclaimed by the next swapoff. * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL * - swp_entry is migration entry -> EINVAL * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT + * - swap-mapped reference requested but needs continued swap count. 
-> ENOMEM */ -static int __swap_duplicate(swp_entry_t entry, bool cache) +static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { - struct swap_info_struct * p; + struct swap_info_struct *p; unsigned long offset, type; - int result = -EINVAL; - int count; - bool has_cache; + unsigned char count; + unsigned char has_cache; + int err = -EINVAL; if (non_swap_entry(entry)) - return -EINVAL; + goto out; type = swp_type(entry); if (type >= nr_swapfiles) goto bad_file; - p = type + swap_info; + p = swap_info[type]; offset = swp_offset(entry); spin_lock(&swap_lock); - if (unlikely(offset >= p->max)) goto unlock_out; - count = swap_count(p->swap_map[offset]); - has_cache = swap_has_cache(p->swap_map[offset]); + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; + err = 0; - if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ + if (usage == SWAP_HAS_CACHE) { /* set SWAP_HAS_CACHE if there is no cache and entry is used */ - if (!has_cache && count) { - p->swap_map[offset] = encode_swapmap(count, true); - result = 0; - } else if (has_cache) /* someone added cache */ - result = -EEXIST; - else if (!count) /* no users */ - result = -ENOENT; + if (!has_cache && count) + has_cache = SWAP_HAS_CACHE; + else if (has_cache) /* someone else added cache */ + err = -EEXIST; + else /* no users remaining */ + err = -ENOENT; } else if (count || has_cache) { - if (count < SWAP_MAP_MAX - 1) { - p->swap_map[offset] = encode_swapmap(count + 1, - has_cache); - result = 0; - } else if (count <= SWAP_MAP_MAX) { - if (swap_overflow++ < 5) - printk(KERN_WARNING - "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, - has_cache); - result = 0; - } + + if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + count += usage; + else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) + err = -EINVAL; + else if (swap_count_continued(p, offset, count)) + count = COUNT_CONTINUED; + else + err = -ENOMEM; } else - result = -ENOENT; /* unused swap entry */ + err = -ENOENT; /* unused swap entry */ + + p->swap_map[offset] = count | has_cache; + unlock_out: spin_unlock(&swap_lock); out: - return result; + return err; bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } + +/* + * Help swapoff by noting that swap entry belongs to shmem/tmpfs + * (in which case its reference count is never incremented). + */ +void swap_shmem_alloc(swp_entry_t entry) +{ + __swap_duplicate(entry, SWAP_MAP_SHMEM); +} + /* - * increase reference count of swap entry by 1. + * Increase reference count of swap entry by 1. + * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required + * but could not be atomically allocated. Returns 0, just as if it succeeded, + * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which + * might occur if a page table entry has got corrupted. */ -void swap_duplicate(swp_entry_t entry) +int swap_duplicate(swp_entry_t entry) { - __swap_duplicate(entry, SWAP_MAP); + int err = 0; + + while (!err && __swap_duplicate(entry, 1) == -ENOMEM) + err = add_swap_count_continuation(entry, GFP_ATOMIC); + return err; } /* * @entry: swap entry for which we allocate swap cache. * - * Called when allocating swap cache for exising swap entry, + * Called when allocating swap cache for existing swap entry, * This can return error codes. Returns 0 at success. * -EBUSY means there is a swap cache. * Note: return code is different from swap_duplicate(). 
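/*
 * Userspace sketch of the retry pattern swap_duplicate() above uses:
 * try to bump a small fixed-width count, and if that reports the count
 * is full, allocate room for a higher "digit" and try again.  The
 * counter model, names and limits below are invented; only the shape
 * of the retry loop mirrors the patch.
 */
#include <assert.h>
#include <errno.h>

#define LOW_MAX 3	/* pretend the in-place counter only holds 0..3 */

struct counter {
	int low;	/* the in-place digit, like the swap_map byte */
	int high;	/* the continuation digit, -1 until allocated */
};

static int dup_ref(struct counter *c)
{
	if (c->low < LOW_MAX) {
		c->low++;
		return 0;
	}
	if (c->high < 0)
		return -ENOMEM;		/* needs a continuation first */
	c->high++;			/* carry into the continuation digit */
	c->low = 1;
	return 0;
}

static int add_continuation(struct counter *c)
{
	c->high = 0;			/* "allocate" the extra digit */
	return 0;
}

int main(void)
{
	struct counter c = { .low = 0, .high = -1 };

	for (int i = 0; i < 10; i++) {
		int err = 0;

		/* same shape as swap_duplicate(): retry after growing storage */
		while (!err && dup_ref(&c) == -ENOMEM)
			err = add_continuation(&c);
		assert(!err);
	}
	assert(c.high * LOW_MAX + c.low == 10);	/* no reference lost */
	return 0;
}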
*/ int swapcache_prepare(swp_entry_t entry) { - return __swap_duplicate(entry, SWAP_CACHE); -} - - -struct swap_info_struct * -get_swap_info_struct(unsigned type) -{ - return &swap_info[type]; + return __swap_duplicate(entry, SWAP_HAS_CACHE); } /* @@ -2181,7 +2235,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) if (!our_page_cluster) /* no readahead */ return 0; - si = &swap_info[swp_type(entry)]; + si = swap_info[swp_type(entry)]; target = swp_offset(entry); base = (target >> our_page_cluster) << our_page_cluster; end = base + (1 << our_page_cluster); @@ -2217,3 +2271,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) *offset = ++toff; return nr_pages? ++nr_pages: 0; } + +/* + * add_swap_count_continuation - called when a swap count is duplicated + * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's + * page of the original vmalloc'ed swap_map, to hold the continuation count + * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called + * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. + * + * These continuation pages are seldom referenced: the common paths all work + * on the original swap_map, only referring to a continuation page when the + * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. + * + * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding + * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) + * can be called after dropping locks. + */ +int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) +{ + struct swap_info_struct *si; + struct page *head; + struct page *page; + struct page *list_page; + pgoff_t offset; + unsigned char count; + + /* + * When debugging, it's easier to use __GFP_ZERO here; but it's better + * for latency not to zero a page while GFP_ATOMIC and holding locks. + */ + page = alloc_page(gfp_mask | __GFP_HIGHMEM); + + si = swap_info_get(entry); + if (!si) { + /* + * An acceptable race has occurred since the failing + * __swap_duplicate(): the swap entry has been freed, + * perhaps even the whole swap_map cleared for swapoff. + */ + goto outer; + } + + offset = swp_offset(entry); + count = si->swap_map[offset] & ~SWAP_HAS_CACHE; + + if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { + /* + * The higher the swap count, the more likely it is that tasks + * will race to add swap count continuation: we need to avoid + * over-provisioning. + */ + goto out; + } + + if (!page) { + spin_unlock(&swap_lock); + return -ENOMEM; + } + + /* + * We are fortunate that although vmalloc_to_page uses pte_offset_map, + * no architecture is using highmem pages for kernel pagetables: so it + * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. + */ + head = vmalloc_to_page(si->swap_map + offset); + offset &= ~PAGE_MASK; + + /* + * Page allocation does not initialize the page's lru field, + * but it does always reset its private field. + */ + if (!page_private(head)) { + BUG_ON(count & COUNT_CONTINUED); + INIT_LIST_HEAD(&head->lru); + set_page_private(head, SWP_CONTINUED); + si->flags |= SWP_CONTINUED; + } + + list_for_each_entry(list_page, &head->lru, lru) { + unsigned char *map; + + /* + * If the previous map said no continuation, but we've found + * a continuation page, free our allocation and use this one. 
+ */ + if (!(count & COUNT_CONTINUED)) + goto out; + + map = kmap_atomic(list_page, KM_USER0) + offset; + count = *map; + kunmap_atomic(map, KM_USER0); + + /* + * If this continuation count now has some space in it, + * free our allocation and use this one. + */ + if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) + goto out; + } + + list_add_tail(&page->lru, &head->lru); + page = NULL; /* now it's attached, don't free it */ +out: + spin_unlock(&swap_lock); +outer: + if (page) + __free_page(page); + return 0; +} + +/* + * swap_count_continued - when the original swap_map count is incremented + * from SWAP_MAP_MAX, check if there is already a continuation page to carry + * into, carry if so, or else fail until a new continuation page is allocated; + * when the original swap_map count is decremented from 0 with continuation, + * borrow from the continuation and report whether it still holds more. + * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. + */ +static bool swap_count_continued(struct swap_info_struct *si, + pgoff_t offset, unsigned char count) +{ + struct page *head; + struct page *page; + unsigned char *map; + + head = vmalloc_to_page(si->swap_map + offset); + if (page_private(head) != SWP_CONTINUED) { + BUG_ON(count & COUNT_CONTINUED); + return false; /* need to add count continuation */ + } + + offset &= ~PAGE_MASK; + page = list_entry(head->lru.next, struct page, lru); + map = kmap_atomic(page, KM_USER0) + offset; + + if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ + goto init_map; /* jump over SWAP_CONT_MAX checks */ + + if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ + /* + * Think of how you add 1 to 999 + */ + while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + BUG_ON(page == head); + map = kmap_atomic(page, KM_USER0) + offset; + } + if (*map == SWAP_CONT_MAX) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + if (page == head) + return false; /* add count continuation */ + map = kmap_atomic(page, KM_USER0) + offset; +init_map: *map = 0; /* we didn't zero the page */ + } + *map += 1; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + while (page != head) { + map = kmap_atomic(page, KM_USER0) + offset; + *map = COUNT_CONTINUED; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + } + return true; /* incremented */ + + } else { /* decrementing */ + /* + * Think of how you subtract 1 from 1000 + */ + BUG_ON(count != COUNT_CONTINUED); + while (*map == COUNT_CONTINUED) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + BUG_ON(page == head); + map = kmap_atomic(page, KM_USER0) + offset; + } + BUG_ON(*map == 0); + *map -= 1; + if (*map == 0) + count = 0; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + while (page != head) { + map = kmap_atomic(page, KM_USER0) + offset; + *map = SWAP_CONT_MAX | count; + count = COUNT_CONTINUED; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + } + return count == COUNT_CONTINUED; + } +} + +/* + * free_swap_count_continuations - swapoff free all the continuation pages + * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 
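swap_count_continued() treats the in-place count as the low digit of a number and each continuation page as a higher digit, carrying on increment and borrowing on decrement ("how you add 1 to 999"). The userspace sketch below models only that carry/borrow idea with a plain positional counter: LOW_RADIX, CONT_RADIX, count_inc() and count_dec() are invented names, and the encoding is deliberately simpler than the kernel's, which saturates the low digit at SWAP_MAP_MAX and flags it with COUNT_CONTINUED instead of wrapping it.

/* Toy multi-digit counter: digits[0] models the swap_map byte,
 * digits[1..] the per-offset bytes of continuation pages. */
#include <stdio.h>

#define LOW_RADIX   63		/* stands in for the in-place SWAP_MAP_MAX range */
#define CONT_RADIX  128		/* stands in for the SWAP_CONT_MAX range */
#define NDIGITS     4

static unsigned int digits[NDIGITS];

/* Add one reference; returns -1 when another continuation "page" is needed. */
static int count_inc(unsigned int *d)
{
	unsigned int i;

	if (++d[0] < LOW_RADIX)
		return 0;
	d[0] = 0;
	for (i = 1; i < NDIGITS; i++) {		/* carry: how you add 1 to 999 */
		if (++d[i] < CONT_RADIX)
			return 0;
		d[i] = 0;
	}
	return -1;
}

/* Drop one reference, borrowing from higher digits when d[0] hits zero. */
static void count_dec(unsigned int *d)
{
	unsigned int i;

	if (d[0]) {
		d[0]--;
		return;
	}
	for (i = 1; i < NDIGITS; i++) {		/* borrow: 1000 - 1 = 999 */
		if (d[i]) {
			d[i]--;
			d[0] = LOW_RADIX - 1;
			return;
		}
		d[i] = CONT_RADIX - 1;
	}
	/* whole counter was already zero: the kernel would BUG here */
	for (i = 1; i < NDIGITS; i++)
		d[i] = 0;
}

int main(void)
{
	unsigned long i;

	for (i = 0; i < 100000; i++)
		if (count_inc(digits))
			break;
	printf("digits after %lu increments: %u %u %u %u\n",
	       i, digits[0], digits[1], digits[2], digits[3]);
	count_dec(digits);
	printf("digits after one decrement:  %u %u %u %u\n",
	       digits[0], digits[1], digits[2], digits[3]);
	return 0;
}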
+ */ +static void free_swap_count_continuations(struct swap_info_struct *si) +{ + pgoff_t offset; + + for (offset = 0; offset < si->max; offset += PAGE_SIZE) { + struct page *head; + head = vmalloc_to_page(si->swap_map + offset); + if (page_private(head)) { + struct list_head *this, *next; + list_for_each_safe(this, next, &head->lru) { + struct page *page; + page = list_entry(this, struct page, lru); + list_del(this); + __free_page(page); + } + } + } +} diff --git a/mm/truncate.c b/mm/truncate.c index 450cebdabfc0..f42675a3615d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/backing-dev.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/module.h> @@ -272,6 +273,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); break; } + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -286,6 +288,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); } } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -327,6 +330,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, pagevec_init(&pvec, 0); while (next <= end && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t index; @@ -354,6 +358,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, break; } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } return ret; @@ -428,6 +433,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, while (next <= end && !wrapped && pagevec_lookup(&pvec, mapping, next, min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index; @@ -477,6 +483,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } return ret; @@ -490,7 +497,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EIO if any pages could not be invalidated. + * Returns -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2(struct address_space *mapping) { @@ -516,22 +523,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); */ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) { - if (new < old) { - struct address_space *mapping = inode->i_mapping; - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, new); - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - } + struct address_space *mapping = inode->i_mapping; + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. 
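The mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end() pairs added around the pagevec loops above let the memory controller coalesce the per-page uncharges of one batch into a single counter update. A rough userspace model of that begin/end batching pattern follows; batch_begin(), charge_drop() and batch_end() are invented names, and the real memcg bookkeeping is considerably more involved.

/* Toy begin/end batching: individual "uncharge" events are accumulated
 * while a batch is open and applied to the shared counter once at the end. */
#include <stdio.h>

static long shared_usage = 1000;	/* stands in for the shared charge counter */
static long batch_pending;
static int  batch_open;

static void batch_begin(void)
{
	batch_open = 1;
}

static void charge_drop(long nr)
{
	if (batch_open)
		batch_pending += nr;	/* cheap: no shared-counter update per page */
	else
		shared_usage -= nr;	/* slow path: touch the counter directly */
}

static void batch_end(void)
{
	shared_usage -= batch_pending;	/* one update for the whole pagevec */
	batch_pending = 0;
	batch_open = 0;
}

int main(void)
{
	int i;

	batch_begin();
	for (i = 0; i < 14; i++)	/* one "pagevec" worth of pages */
		charge_drop(1);
	batch_end();
	printf("usage now %ld\n", shared_usage);	/* 986 */
	return 0;
}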
However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, new); + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); diff --git a/mm/util.c b/mm/util.c index 7c35ad95f927..f5712e8964be 100644 --- a/mm/util.c +++ b/mm/util.c @@ -186,6 +186,27 @@ void kzfree(const void *p) } EXPORT_SYMBOL(kzfree); +int kern_ptr_validate(const void *ptr, unsigned long size) +{ + unsigned long addr = (unsigned long)ptr; + unsigned long min_addr = PAGE_OFFSET; + unsigned long align_mask = sizeof(void *) - 1; + + if (unlikely(addr < min_addr)) + goto out; + if (unlikely(addr > (unsigned long)high_memory - size)) + goto out; + if (unlikely(addr & align_mask)) + goto out; + if (unlikely(!kern_addr_valid(addr))) + goto out; + if (unlikely(!kern_addr_valid(addr + size - 1))) + goto out; + return 1; +out: + return 0; +} + /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -220,7 +241,7 @@ char *strndup_user(const char __user *s, long n) } EXPORT_SYMBOL(strndup_user); -#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f551a4a44cd..ae007462b7f6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* for per-CPU blocks */ +static void purge_fragmented_blocks_allcpus(void); + /* * Purges all lazily-freed vmap areas. 
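kern_ptr_validate(), added to mm/util.c above, rejects a pointer that lies below PAGE_OFFSET, runs past high_memory, is misaligned, or fails kern_addr_valid(). Below is a userspace analogue of the range and alignment checks against an ordinary buffer; buf_ptr_validate() and the pool buffer are made up for the sketch, and the kern_addr_valid() test has no userspace counterpart here.

/* Validate that [ptr, ptr + size) lies inside a known buffer and is
 * pointer-aligned; a userspace analogue of the checks above. */
#include <stdint.h>
#include <stdio.h>

static long pool[512];			/* 4 KB-ish, naturally pointer-aligned */

static int buf_ptr_validate(const void *ptr, unsigned long size)
{
	uintptr_t addr = (uintptr_t)ptr;
	uintptr_t min_addr = (uintptr_t)pool;			/* like PAGE_OFFSET */
	uintptr_t max_addr = (uintptr_t)pool + sizeof(pool);	/* like high_memory */
	uintptr_t align_mask = sizeof(void *) - 1;

	if (addr < min_addr)
		return 0;
	if (size > max_addr - addr)	/* also catches addr + size wrap-around */
		return 0;
	if (addr & align_mask)
		return 0;
	return 1;
}

int main(void)
{
	printf("%d %d %d\n",
	       buf_ptr_validate(pool, 64),			/* 1: inside, aligned */
	       buf_ptr_validate((char *)pool + 1, 64),		/* 0: misaligned */
	       buf_ptr_validate((char *)pool + sizeof(pool) - 8, 64));	/* 0: too long */
	return 0;
}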
* @@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } else spin_lock(&purge_lock); + if (sync) + purge_fragmented_blocks_allcpus(); + rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { if (va->flags & VM_LAZY_FREE) { @@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } rcu_read_unlock(); - if (nr) { - BUG_ON(nr > atomic_read(&vmap_lazy_nr)); + if (nr) atomic_sub(nr, &vmap_lazy_nr); - } if (nr || force_flush) flush_tlb_kernel_range(*start, *end); @@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false; struct vmap_block_queue { spinlock_t lock; struct list_head free; - struct list_head dirty; - unsigned int nr_dirty; }; struct vmap_block { @@ -680,10 +682,9 @@ struct vmap_block { unsigned long free, dirty; DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); - union { - struct list_head free_list; - struct rcu_head rcu_head; - }; + struct list_head free_list; + struct rcu_head rcu_head; + struct list_head purge; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ @@ -759,9 +760,9 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vbq = &get_cpu_var(vmap_block_queue); vb->vbq = vbq; spin_lock(&vbq->lock); - list_add(&vb->free_list, &vbq->free); + list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); - put_cpu_var(vmap_cpu_blocks); + put_cpu_var(vmap_block_queue); return vb; } @@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb) struct vmap_block *tmp; unsigned long vb_idx; - BUG_ON(!list_empty(&vb->free_list)); - vb_idx = addr_to_vb_idx(vb->va->va_start); spin_lock(&vmap_block_tree_lock); tmp = radix_tree_delete(&vmap_block_tree, vb_idx); @@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb) call_rcu(&vb->rcu_head, rcu_free_vb); } +static void purge_fragmented_blocks(int cpu) +{ + LIST_HEAD(purge); + struct vmap_block *vb; + struct vmap_block *n_vb; + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + + if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) + continue; + + spin_lock(&vb->lock); + if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { + vb->free = 0; /* prevent further allocs after releasing lock */ + vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ + bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + spin_unlock(&vb->lock); + list_add_tail(&vb->purge, &purge); + } else + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + + list_for_each_entry_safe(vb, n_vb, &purge, purge) { + list_del(&vb->purge); + free_vmap_block(vb); + } +} + +static void purge_fragmented_blocks_thiscpu(void) +{ + purge_fragmented_blocks(smp_processor_id()); +} + +static void purge_fragmented_blocks_allcpus(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + purge_fragmented_blocks(cpu); +} + static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; unsigned long addr = 0; unsigned int order; + int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -808,25 +856,39 @@ again: int i; spin_lock(&vb->lock); + if (vb->free < 1UL << order) + goto next; + i = bitmap_find_free_region(vb->alloc_map, 
VMAP_BBMAP_BITS, order); - if (i >= 0) { - addr = vb->va->va_start + (i << PAGE_SHIFT); - BUG_ON(addr_to_vb_idx(addr) != - addr_to_vb_idx(vb->va->va_start)); - vb->free -= 1UL << order; - if (vb->free == 0) { - spin_lock(&vbq->lock); - list_del_init(&vb->free_list); - spin_unlock(&vbq->lock); + if (i < 0) { + if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { + /* fragmented and no outstanding allocations */ + BUG_ON(vb->dirty != VMAP_BBMAP_BITS); + purge = 1; } - spin_unlock(&vb->lock); - break; + goto next; + } + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); } spin_unlock(&vb->lock); + break; +next: + spin_unlock(&vb->lock); } - put_cpu_var(vmap_cpu_blocks); + + if (purge) + purge_fragmented_blocks_thiscpu(); + + put_cpu_var(vmap_block_queue); rcu_read_unlock(); if (!addr) { @@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(!vb); spin_lock(&vb->lock); - bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { - BUG_ON(vb->free || !list_empty(&vb->free_list)); + BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else @@ -1035,8 +1097,6 @@ void __init vmalloc_init(void) vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); - INIT_LIST_HEAD(&vbq->dirty); - vbq->nr_dirty = 0; } /* Import existing vmlist entries. */ @@ -1411,6 +1471,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, { struct page **pages; unsigned int nr_pages, array_size, i; + gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1418,13 +1479,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, + pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { - pages = kmalloc_node(array_size, - (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, - node); + pages = kmalloc_node(array_size, nested_gfp, node); } area->pages = pages; area->caller = caller; diff --git a/mm/vmscan.c b/mm/vmscan.c index 777af57fd8c8..3ff3311447f5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -13,7 +13,7 @@ #include <linux/mm.h> #include <linux/module.h> -#include <linux/slab.h> +#include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/pagemap.h> @@ -55,6 +55,11 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + unsigned long hibernation_mode; + /* This context's GFP mask */ gfp_t gfp_mask; @@ -66,12 +71,6 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; - /* This context's SWAP_CLUSTER_MAX. If freeing memory for - * suspend, we effectively ignore SWAP_CLUSTER_MAX. - * In this context, it doesn't matter that we scan the - * whole list at once. 
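vb_alloc() above carves 1 << order pages out of a vmap block by searching the block's alloc_map with bitmap_find_free_region(). The sketch below is a simplified userspace version of that naturally aligned, power-of-two search; find_free_region() here is a stand-in written for the example, not the kernel helper.

/* Find a free, naturally aligned run of 1 << order bits, mark it
 * allocated and return its start, or -1 if the block is too fragmented. */
#include <stdio.h>

#define NBITS 64

static unsigned char map[NBITS];	/* one byte per bit, for clarity */

static int find_free_region(int order)
{
	int len = 1 << order;
	int start, i;

	for (start = 0; start + len <= NBITS; start += len) {	/* aligned steps */
		for (i = 0; i < len; i++)
			if (map[start + i])
				break;
		if (i == len) {
			for (i = 0; i < len; i++)
				map[start + i] = 1;
			return start;
		}
	}
	return -1;
}

int main(void)
{
	printf("order-2 region at %d\n", find_free_region(2));	/* 0 */
	printf("order-2 region at %d\n", find_free_region(2));	/* 4 */
	printf("order-0 region at %d\n", find_free_region(0));	/* 8 */
	return 0;
}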
*/ - int swap_cluster_max; - int swappiness; int all_unreclaimable; @@ -263,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } -/* Called without lock on whether page is mapped, so answer is unstable */ -static inline int page_mapping_inuse(struct page *page) -{ - struct address_space *mapping; - - /* Page is in somebody's page tables. */ - if (page_mapped(page)) - return 1; - - /* Be more reluctant to reclaim swapcache than pagecache */ - if (PageSwapCache(page)) - return 1; - - mapping = page_mapping(page); - if (!mapping) - return 0; - - /* File is mmap'd by somebody? */ - return mapping_mapped(mapping); -} - static inline int is_page_cache_freeable(struct page *page) { /* @@ -358,7 +336,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in generic_file_write() against + * If this process is currently in __generic_file_aio_write() against * this page's queue, we can perform writeback even if that * will block. * @@ -580,6 +558,65 @@ redo: put_page(page); /* drop ref from isolate */ } +enum page_references { + PAGEREF_RECLAIM, + PAGEREF_RECLAIM_CLEAN, + PAGEREF_KEEP, + PAGEREF_ACTIVATE, +}; + +static enum page_references page_check_references(struct page *page, + struct scan_control *sc) +{ + int referenced_ptes, referenced_page; + unsigned long vm_flags; + + referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); + referenced_page = TestClearPageReferenced(page); + + /* Lumpy reclaim - ignore references */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + return PAGEREF_RECLAIM; + + /* + * Mlock lost the isolation race with us. Let try_to_unmap() + * move the page to the unevictable list. + */ + if (vm_flags & VM_LOCKED) + return PAGEREF_RECLAIM; + + if (referenced_ptes) { + if (PageAnon(page)) + return PAGEREF_ACTIVATE; + /* + * All mapped pages start out with page table + * references from the instantiating fault, so we need + * to look twice if a mapped file page is used more + * than once. + * + * Mark it and spare it for another trip around the + * inactive list. Another page table reference will + * lead to its activation. + * + * Note: the mark is set for activated pages as well + * so that recently deactivated but used pages are + * quickly recovered. + */ + SetPageReferenced(page); + + if (referenced_page) + return PAGEREF_ACTIVATE; + + return PAGEREF_KEEP; + } + + /* Reclaim if clean, defer dirty pages to writeback */ + if (referenced_page) + return PAGEREF_RECLAIM_CLEAN; + + return PAGEREF_RECLAIM; +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -591,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pagevec freed_pvec; int pgactivate = 0; unsigned long nr_reclaimed = 0; - unsigned long vm_flags; cond_resched(); pagevec_init(&freed_pvec, 1); while (!list_empty(page_list)) { + enum page_references references; struct address_space *mapping; struct page *page; int may_enter_fs; - int referenced; cond_resched(); @@ -642,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, - sc->mem_cgroup, &vm_flags); - /* - * In active use or really unfreeable? Activate it. 
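page_check_references() replaces the old "referenced && page_mapping_inuse() && !VM_LOCKED" test with a four-way decision: lumpy reclaim and mlocked pages are reclaimed regardless, referenced anonymous pages are activated, referenced file pages get one more trip around the inactive list via PageReferenced, and everything else is reclaimed (clean-only if it was recently referenced). A compact userspace model of that decision follows; struct toy_page and its fields are invented for the sketch.

/* Toy model of the reference-check decision above; flags are invented. */
#include <stdio.h>

enum page_ref { REF_RECLAIM, REF_RECLAIM_CLEAN, REF_KEEP, REF_ACTIVATE };

struct toy_page {
	int pte_refs;		/* referenced through page tables? */
	int soft_ref;		/* the PageReferenced software bit */
	int anon;		/* anonymous rather than file-backed */
	int locked;		/* a VM_LOCKED vma was found while checking */
};

static enum page_ref check_refs(struct toy_page *p, int lumpy)
{
	int was_soft_ref = p->soft_ref;

	p->soft_ref = 0;			/* TestClearPageReferenced() */

	if (lumpy || p->locked)
		return REF_RECLAIM;

	if (p->pte_refs) {
		if (p->anon)
			return REF_ACTIVATE;
		p->soft_ref = 1;		/* mark for the second trip */
		return was_soft_ref ? REF_ACTIVATE : REF_KEEP;
	}

	return was_soft_ref ? REF_RECLAIM_CLEAN : REF_RECLAIM;
}

int main(void)
{
	struct toy_page file = { .pte_refs = 1 };

	/* A mapped file page needs two referenced passes to be activated. */
	printf("pass 1: %d\n", check_refs(&file, 0));	/* REF_KEEP (2) */
	printf("pass 2: %d\n", check_refs(&file, 0));	/* REF_ACTIVATE (3) */
	return 0;
}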
- * If page which have PG_mlocked lost isoltation race, - * try_to_unmap moves it to unevictable list - */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page) - && !(vm_flags & VM_LOCKED)) + references = page_check_references(page, sc); + switch (references) { + case PAGEREF_ACTIVATE: goto activate_locked; + case PAGEREF_KEEP: + goto keep_locked; + case PAGEREF_RECLAIM: + case PAGEREF_RECLAIM_CLEAN: + ; /* try to reclaim the page below */ + } /* * Anonymous process memory has backing store? @@ -686,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) goto keep_locked; @@ -1132,7 +1167,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_anon; unsigned long nr_file; - nr_taken = sc->isolate_pages(sc->swap_cluster_max, + nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, &page_list, &nr_scan, sc->order, mode, zone, sc->mem_cgroup, 0, file); @@ -1166,10 +1201,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); - reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; - reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; - reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; - reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + reclaim_stat->recent_scanned[0] += nr_anon; + reclaim_stat->recent_scanned[1] += nr_file; spin_unlock_irq(&zone->lru_lock); @@ -1353,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, continue; } - /* page_referenced clears PageReferenced */ - if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { + if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { nr_rotated++; /* * Identify referenced, file-backed active pages and @@ -1464,20 +1495,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, + int file) +{ + if (file) + return inactive_file_is_low(zone, sc); + else + return inactive_anon_is_low(zone, sc); +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (is_active_lru(lru)) { + if (inactive_list_is_low(zone, sc, file)) + shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } - if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); - return 0; - } return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } @@ -1567,15 +1604,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * until we collected @swap_cluster_max pages to scan. 
*/ static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan, - unsigned long swap_cluster_max) + unsigned long *nr_saved_scan) { unsigned long nr; *nr_saved_scan += nr_to_scan; nr = *nr_saved_scan; - if (nr >= swap_cluster_max) + if (nr >= SWAP_CLUSTER_MAX) *nr_saved_scan = 0; else nr = 0; @@ -1594,7 +1630,7 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; - unsigned long swap_cluster_max = sc->swap_cluster_max; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int noswap = 0; @@ -1616,15 +1652,15 @@ static void shrink_zone(int priority, struct zone *zone, scan = (scan * percent[file]) / 100; } nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l], - swap_cluster_max); + &reclaim_stat->nr_saved_scan[l]); } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { if (nr[l]) { - nr_to_scan = min(nr[l], swap_cluster_max); + nr_to_scan = min_t(unsigned long, + nr[l], SWAP_CLUSTER_MAX); nr[l] -= nr_to_scan; nr_reclaimed += shrink_list(l, nr_to_scan, @@ -1639,8 +1675,7 @@ static void shrink_zone(int priority, struct zone *zone, * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed > swap_cluster_max && - priority < DEF_PRIORITY && !current_is_kswapd()) + if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } @@ -1693,8 +1728,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist, continue; note_zone_scanning_priority(zone, priority); - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ sc->all_unreclaimable = 0; } else { @@ -1738,6 +1772,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + unsigned long writeback_threshold; delayacct_freepages_start(); @@ -1773,7 +1808,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } } total_scanned += sc->nr_scanned; - if (sc->nr_reclaimed >= sc->swap_cluster_max) { + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { ret = sc->nr_reclaimed; goto out; } @@ -1785,14 +1820,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc->swap_cluster_max + - sc->swap_cluster_max / 2) { + writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; + if (total_scanned > writeback_threshold) { wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ - if (sc->nr_scanned && priority < DEF_PRIORITY - 2) + if (!sc->hibernation_mode && sc->nr_scanned && + priority < DEF_PRIORITY - 2) congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? 
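nr_scan_try_batch(), shown above, accumulates per-LRU scan targets across calls and only releases them once at least SWAP_CLUSTER_MAX pages have been saved up, so small per-priority contributions are not dribbled out a few pages at a time. A standalone sketch of that accumulate-then-release behaviour (SWAP_CLUSTER_MAX is assumed to be 32 here):

/* Accumulate scan requests until a whole batch is available. */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long scan_try_batch(unsigned long nr_to_scan,
				    unsigned long *nr_saved_scan)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;

	if (nr >= SWAP_CLUSTER_MAX)
		*nr_saved_scan = 0;	/* release the whole accumulated batch */
	else
		nr = 0;			/* keep saving, scan nothing this time */

	return nr;
}

int main(void)
{
	unsigned long saved = 0;
	int round;

	/* 12 pages per round: rounds 1 and 2 scan nothing, round 3 scans 36. */
	for (round = 1; round <= 3; round++) {
		unsigned long got = scan_try_batch(12, &saved);

		printf("round %d: scan %lu (saved %lu)\n", round, got, saved);
	}
	return 0;
}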
don't OOM, then */ @@ -1831,7 +1867,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, .swappiness = vm_swappiness, @@ -1855,7 +1891,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem, @@ -1889,7 +1924,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, @@ -1904,6 +1939,33 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* is kswapd sleeping prematurely? */ +static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +{ + int i; + + /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ + if (remaining) + return 1; + + /* If after HZ/10, a zone is below the high mark, it's premature */ + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable) + continue; + + if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), + 0, 0)) + return 1; + } + + return 0; +} + /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). @@ -1936,7 +1998,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .gfp_mask = GFP_KERNEL, .may_unmap = 1, .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, + /* + * kswapd doesn't want to be bailed out while reclaim. because + * we want to put equal scanning pressure on each zone. + */ + .nr_to_reclaim = ULONG_MAX, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, @@ -1961,6 +2027,7 @@ loop_again: for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; + int has_under_min_watermark_zone = 0; /* The swap token gets in the way of swapout... 
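sleeping_prematurely() above decides whether kswapd may really go to sleep: sleeping is premature if a direct reclaimer woke it during the trial nap (remaining != 0), or if any populated, still-reclaimable zone is below its high watermark. A userspace sketch of that per-zone scan follows; struct toy_zone is an invented stand-in for struct zone, and the real zone_watermark_ok() also accounts for the allocation order and lowmem reserves.

/* Model of the "is kswapd sleeping prematurely?" scan. */
#include <stdio.h>

struct toy_zone {
	unsigned long free_pages;
	unsigned long high_wmark;
	int populated;
	int all_unreclaimable;
};

static int sleeping_prematurely(struct toy_zone *zones, int nr, long remaining)
{
	int i;

	if (remaining)			/* woken again before the nap expired */
		return 1;

	for (i = 0; i < nr; i++) {
		struct toy_zone *z = &zones[i];

		if (!z->populated || z->all_unreclaimable)
			continue;
		if (z->free_pages < z->high_wmark)	/* still under pressure */
			return 1;
	}
	return 0;
}

int main(void)
{
	struct toy_zone zones[] = {
		{ .free_pages = 900, .high_wmark = 512, .populated = 1 },
		{ .free_pages = 100, .high_wmark = 256, .populated = 1 },
	};

	printf("premature? %d\n", sleeping_prematurely(zones, 2, 0));	/* 1 */
	zones[1].free_pages = 300;
	printf("premature? %d\n", sleeping_prematurely(zones, 2, 0));	/* 0 */
	return 0;
}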
*/ if (!priority) @@ -1978,8 +2045,7 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* @@ -2022,13 +2088,9 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, - high_wmark_pages(zone), end_zone, 0)) - all_zones_ok = 0; temp_priority[i] = priority; sc.nr_scanned = 0; note_zone_scanning_priority(zone, priority); @@ -2053,12 +2115,11 @@ loop_again: lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; - if (zone_is_all_unreclaimable(zone)) + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && zone->pages_scanned >= - (zone_reclaimable_pages(zone) * 6)) - zone_set_flag(zone, - ZONE_ALL_UNRECLAIMABLE); + if (nr_slab == 0 && + zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6)) + zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and * the reclaim ratio is low, start doing writepage @@ -2067,6 +2128,20 @@ loop_again: if (total_scanned > SWAP_CLUSTER_MAX * 2 && total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) { + all_zones_ok = 0; + /* + * We are still under min water mark. This + * means that we have a GFP_ATOMIC allocation + * failure risk. Hurry up! + */ + if (!zone_watermark_ok(zone, order, + min_wmark_pages(zone), end_zone, 0)) + has_under_min_watermark_zone = 1; + } + } if (all_zones_ok) break; /* kswapd: all done */ @@ -2074,8 +2149,12 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ/10); + if (total_scanned && (priority < DEF_PRIORITY - 2)) { + if (has_under_min_watermark_zone) + count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); + else + congestion_wait(BLK_RW_ASYNC, HZ/10); + } /* * We do this so kswapd doesn't build up large priorities for @@ -2173,6 +2252,7 @@ static int kswapd(void *p) order = 0; for ( ; ; ) { unsigned long new_order; + int ret; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; @@ -2184,19 +2264,45 @@ static int kswapd(void *p) */ order = new_order; } else { - if (!freezing(current)) - schedule(); + if (!freezing(current) && !kthread_should_stop()) { + long remaining = 0; + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a + * premature sleep. 
If not, then go fully + * to sleep until explicitly woken up + */ + if (!sleeping_prematurely(pgdat, order, remaining)) + schedule(); + else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + } order = pgdat->kswapd_max_order; } finish_wait(&pgdat->kswapd_wait, &wait); - if (!try_to_freeze()) { - /* We can speed up thawing tasks if we don't call - * balance_pgdat after returning from the refrigerator - */ + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + /* + * We can speed up thawing tasks if we don't call balance_pgdat + * after returning from the refrigerator + */ + if (!ret) balance_pgdat(pgdat, order); - } } return 0; } @@ -2260,148 +2366,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone) #ifdef CONFIG_HIBERNATION /* - * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority. - * - * For pass > 3 we also try to shrink the LRU lists that contain a few pages - */ -static void shrink_all_zones(unsigned long nr_pages, int prio, - int pass, struct scan_control *sc) -{ - struct zone *zone; - unsigned long nr_reclaimed = 0; - struct zone_reclaim_stat *reclaim_stat; - - for_each_populated_zone(zone) { - enum lru_list l; - - if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) - continue; - - for_each_evictable_lru(l) { - enum zone_stat_item ls = NR_LRU_BASE + l; - unsigned long lru_pages = zone_page_state(zone, ls); - - /* For pass = 0, we don't shrink the active list */ - if (pass == 0 && (l == LRU_ACTIVE_ANON || - l == LRU_ACTIVE_FILE)) - continue; - - reclaim_stat = get_reclaim_stat(zone, sc); - reclaim_stat->nr_saved_scan[l] += - (lru_pages >> prio) + 1; - if (reclaim_stat->nr_saved_scan[l] - >= nr_pages || pass > 3) { - unsigned long nr_to_scan; - - reclaim_stat->nr_saved_scan[l] = 0; - nr_to_scan = min(nr_pages, lru_pages); - nr_reclaimed += shrink_list(l, nr_to_scan, zone, - sc, prio); - if (nr_reclaimed >= nr_pages) { - sc->nr_reclaimed += nr_reclaimed; - return; - } - } - } - } - sc->nr_reclaimed += nr_reclaimed; -} - -/* - * Try to free `nr_pages' of memory, system-wide, and return the number of + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. 
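The reworked kswapd wait above sleeps in two phases: a trial nap of HZ/10, a second sleeping_prematurely() check, and only then an indefinite schedule(); if the nap already turns out premature, kswapd goes straight back to balancing and the matching vmstat event is counted. A condensed userspace model of that decision follows; premature() and trial_nap() are invented stand-ins for sleeping_prematurely() and schedule_timeout().

/* Two-phase sleep decision: trial nap, re-check, then commit or not. */
#include <stdio.h>

struct node_state {
	int below_high_wmark;	/* some zone is under its high watermark */
	int woken_during_nap;	/* a direct reclaimer kicked kswapd again */
};

static int premature(const struct node_state *n, long remaining)
{
	return remaining || n->below_high_wmark;
}

/* Stand-in for schedule_timeout(HZ/10): returns leftover time if woken. */
static long trial_nap(const struct node_state *n)
{
	return n->woken_during_nap ? 5 : 0;
}

static const char *kswapd_wait(const struct node_state *n)
{
	long remaining = 0;

	if (!premature(n, remaining))
		remaining = trial_nap(n);

	if (!premature(n, remaining))
		return "sleep until explicitly woken";

	return remaining ? "KSWAPD_LOW_WMARK_HIT_QUICKLY"
			 : "KSWAPD_HIGH_WMARK_HIT_QUICKLY";
}

int main(void)
{
	struct node_state calm = { 0, 0 };
	struct node_state busy = { 0, 1 };
	struct node_state low  = { 1, 0 };

	printf("%s\n", kswapd_wait(&calm));	/* sleeps fully */
	printf("%s\n", kswapd_wait(&busy));	/* woken during the trial nap */
	printf("%s\n", kswapd_wait(&low));	/* never even napped */
	return 0;
}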
* * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ -unsigned long shrink_all_memory(unsigned long nr_pages) +unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { - unsigned long lru_pages, nr_slab; - int pass; struct reclaim_state reclaim_state; struct scan_control sc = { - .gfp_mask = GFP_KERNEL, - .may_unmap = 0, + .gfp_mask = GFP_HIGHUSER_MOVABLE, + .may_swap = 1, + .may_unmap = 1, .may_writepage = 1, + .nr_to_reclaim = nr_to_reclaim, + .hibernation_mode = 1, + .swappiness = vm_swappiness, + .order = 0, .isolate_pages = isolate_pages_global, - .nr_reclaimed = 0, }; + struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + struct task_struct *p = current; + unsigned long nr_reclaimed; - current->reclaim_state = &reclaim_state; - - lru_pages = global_reclaimable_pages(); - nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); - /* If slab caches are huge, it's better to hit them first */ - while (nr_slab >= lru_pages) { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); - if (!reclaim_state.reclaimed_slab) - break; - - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - nr_slab -= reclaim_state.reclaimed_slab; - } - - /* - * We try to shrink LRUs in 5 passes: - * 0 = Reclaim from inactive_list only - * 1 = Reclaim from active list but don't reclaim mapped - * 2 = 2nd pass of type 1 - * 3 = Reclaim mapped (normal reclaim) - * 4 = 2nd pass of type 3 - */ - for (pass = 0; pass < 5; pass++) { - int prio; - - /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) - sc.may_unmap = 1; - - for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; - - sc.nr_scanned = 0; - sc.swap_cluster_max = nr_to_scan; - shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (sc.nr_reclaimed >= nr_pages) - goto out; - - reclaim_state.reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_reclaimable_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ / 10); - } - } - - /* - * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be - * something in slab caches - */ - if (!sc.nr_reclaimed) { - do { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, - global_reclaimable_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - } while (sc.nr_reclaimed < nr_pages && - reclaim_state.reclaimed_slab > 0); - } + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(sc.gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); -out: - current->reclaim_state = NULL; + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; - return sc.nr_reclaimed; + return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ @@ -2451,6 +2452,17 @@ int kswapd_run(int nid) return ret; } +/* + * Called by memory hotplug when all memory in a node is offlined. 
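After the rewrite, shrink_all_memory() is essentially one particular scan_control configuration handed to do_try_to_free_pages(): GFP_HIGHUSER_MOVABLE, writepage/unmap/swap allowed, nr_to_reclaim set to the caller's target and hibernation_mode set so the congestion waits are skipped. The sketch below contrasts that setup with the try_to_free_pages() one using a cut-down struct; toy_scan_control mirrors only the fields this diff touches and is not the kernel definition.

/* Cut-down scan_control with only the fields touched in this diff. */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

struct toy_scan_control {
	unsigned long nr_to_reclaim;
	unsigned long hibernation_mode;
	int may_writepage;
	int may_unmap;
	int may_swap;
	int order;
};

int main(void)
{
	/* Direct reclaim: stop once a batch worth of pages has been freed.
	 * (may_writepage is !laptop_mode in the kernel; omitted here.) */
	struct toy_scan_control direct = {
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_unmap = 1,
		.may_swap = 1,
	};

	/* Hibernation: reclaim exactly what the caller asked for, and never
	 * stall in congestion_wait() (hibernation_mode set). */
	struct toy_scan_control hibernate = {
		.nr_to_reclaim = 8000,
		.hibernation_mode = 1,
		.may_writepage = 1,
		.may_unmap = 1,
		.may_swap = 1,
	};

	printf("direct:    reclaim %lu, hibernation %lu\n",
	       direct.nr_to_reclaim, direct.hibernation_mode);
	printf("hibernate: reclaim %lu, hibernation %lu\n",
	       hibernate.nr_to_reclaim, hibernate.hibernation_mode);
	return 0;
}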
+ */ +void kswapd_stop(int nid) +{ + struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + + if (kswapd) + kthread_stop(kswapd); +} + static int __init kswapd_init(void) { int nid; @@ -2553,8 +2565,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, - .swap_cluster_max = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), + .nr_to_reclaim = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, .order = order, @@ -2570,6 +2582,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * and RECLAIM_SWAP. */ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -2613,6 +2626,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } @@ -2635,7 +2649,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (zone_is_all_unreclaimable(zone)) + if (zone->all_unreclaimable) return ZONE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index c81321f9feec..fa12ea3051fb 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/cpu.h> #include <linux/vmstat.h> #include <linux/sched.h> @@ -139,7 +140,8 @@ static void refresh_zone_stat_thresholds(void) threshold = calculate_threshold(zone); for_each_online_cpu(cpu) - zone_pcp(zone, cpu)->stat_threshold = threshold; + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; } } @@ -149,7 +151,8 @@ static void refresh_zone_stat_thresholds(void) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); + s8 *p = pcp->vm_stat_diff + item; long x; @@ -202,7 +205,7 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); s8 *p = pcp->vm_stat_diff + item; (*p)++; @@ -223,7 +226,7 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); s8 *p = pcp->vm_stat_diff + item; (*p)--; @@ -300,7 +303,7 @@ void refresh_cpu_vm_stats(int cpu) for_each_populated_zone(zone) { struct per_cpu_pageset *p; - p = zone_pcp(zone, cpu); + p = per_cpu_ptr(zone->pageset, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (p->vm_stat_diff[i]) { @@ -683,6 +686,9 @@ static const char * const vmstat_text[] = { "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", + "kswapd_low_wmark_hit_quickly", + "kswapd_high_wmark_hit_quickly", + "kswapd_skip_congestion_wait", "pageoutrun", "allocstall", @@ -738,7 +744,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for_each_online_cpu(i) { struct 
per_cpu_pageset *pageset; - pageset = zone_pcp(zone, i); + pageset = per_cpu_ptr(zone->pageset, i); seq_printf(m, "\n cpu: %i" "\n count: %i" @@ -758,7 +764,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n prev_priority: %i" "\n start_pfn: %lu" "\n inactive_ratio: %u", - zone_is_all_unreclaimable(zone), + zone->all_unreclaimable, zone->prev_priority, zone->zone_start_pfn, zone->inactive_ratio); @@ -883,11 +889,10 @@ static void vmstat_update(struct work_struct *w) static void __cpuinit start_cpu_timer(int cpu) { - struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); + struct delayed_work *work = &per_cpu(vmstat_work, cpu); - INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); - schedule_delayed_work_on(cpu, vmstat_work, - __round_jiffies_relative(HZ, cpu)); + INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); + schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } /* @@ -904,6 +909,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: start_cpu_timer(cpu); + node_set_state(cpu_to_node(cpu), N_CPU); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: |
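The vmstat conversion above replaces zone_pcp(zone, cpu) with per_cpu_ptr(zone->pageset, cpu): each CPU keeps a small signed vm_stat_diff delta per counter plus a stat_threshold, and the delta is folded into the zone-wide counter only once it outgrows the threshold. A single-threaded userspace model of that split is below; the folding rule shown is the usual behaviour of these counters rather than something visible in the hunks above, and the names are merely echoes of the kernel's.

/* Per-CPU delta counters with threshold folding, modelled in userspace. */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct toy_pageset {
	signed char vm_stat_diff;	/* small per-CPU delta, like s8 */
	int stat_threshold;
};

static long zone_stat;			/* the shared zone-wide counter */
static struct toy_pageset pageset[NR_CPUS];

static void mod_zone_state(int cpu, int delta)
{
	struct toy_pageset *pcp = &pageset[cpu];
	long x = pcp->vm_stat_diff + delta;

	if (labs(x) > pcp->stat_threshold) {	/* fold into the zone counter */
		zone_stat += x;
		x = 0;
	}
	pcp->vm_stat_diff = x;
}

int main(void)
{
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pageset[cpu].stat_threshold = 8;

	for (i = 0; i < 10; i++)
		mod_zone_state(0, 1);	/* folds once the delta passes 8 */

	printf("zone %ld, cpu0 delta %d\n", zone_stat, pageset[0].vm_stat_diff);
	return 0;
}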