author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-16 11:51:08 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-16 11:51:08 -0700
commit | 271ecc5253e2b317d729d366560789cd7f93836c (patch)
tree | d3a60bc4dfa8245ff934f357f2367db76b59e7cf /mm
parent | aa6865d836418eb2ba888a4cb1318a28e9aa2e0c (diff)
parent | 63c06227a22b098a3849c5c99e836aea161ca0d7 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge first patch-bomb from Andrew Morton:
- some misc things
- ocfs2 updates
- about half of MM
- checkpatch updates
- autofs4 update
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (120 commits)
autofs4: fix string.h include in auto_dev-ioctl.h
autofs4: use pr_xxx() macros directly for logging
autofs4: change log print macros to not insert newline
autofs4: make autofs log prints consistent
autofs4: fix some white space errors
autofs4: fix invalid ioctl return in autofs4_root_ioctl_unlocked()
autofs4: fix coding style line length in autofs4_wait()
autofs4: fix coding style problem in autofs4_get_set_timeout()
autofs4: coding style fixes
autofs: show pipe inode in mount options
kallsyms: add support for relative offsets in kallsyms address table
kallsyms: don't overload absolute symbol type for percpu symbols
x86: kallsyms: disable absolute percpu symbols on !SMP
checkpatch: fix another left brace warning
checkpatch: improve UNSPECIFIED_INT test for bare signed/unsigned uses
checkpatch: warn on bare unsigned or signed declarations without int
checkpatch: exclude asm volatile from complex macro check
mm: memcontrol: drop unnecessary lru locking from mem_cgroup_migrate()
mm: migrate: consolidate mem_cgroup_migrate() calls
mm/compaction: speed up pageblock_pfn_to_page() when zone is contiguous
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig.debug | 57
-rw-r--r-- | mm/Makefile | 2
-rw-r--r-- | mm/compaction.c | 93
-rw-r--r-- | mm/debug.c | 165
-rw-r--r-- | mm/failslab.c | 12
-rw-r--r-- | mm/filemap.c | 113
-rw-r--r-- | mm/huge_memory.c | 20
-rw-r--r-- | mm/internal.h | 18
-rw-r--r-- | mm/kmemcheck.c | 3
-rw-r--r-- | mm/madvise.c | 19
-rw-r--r-- | mm/memblock.c | 8
-rw-r--r-- | mm/memcontrol.c | 92
-rw-r--r-- | mm/memory-failure.c | 2
-rw-r--r-- | mm/memory.c | 7
-rw-r--r-- | mm/memory_hotplug.c | 30
-rw-r--r-- | mm/mempolicy.c | 4
-rw-r--r-- | mm/migrate.c | 23
-rw-r--r-- | mm/oom_kill.c | 7
-rw-r--r-- | mm/page-writeback.c | 62
-rw-r--r-- | mm/page_alloc.c | 295
-rw-r--r-- | mm/page_ext.c | 10
-rw-r--r-- | mm/page_owner.c | 100
-rw-r--r-- | mm/page_poison.c (renamed from mm/debug-pagealloc.c) | 67
-rw-r--r-- | mm/rmap.c | 16
-rw-r--r-- | mm/shmem.c | 2
-rw-r--r-- | mm/slab.c | 1037
-rw-r--r-- | mm/slab.h | 69
-rw-r--r-- | mm/slab_common.c | 8
-rw-r--r-- | mm/slub.c | 325
-rw-r--r-- | mm/truncate.c | 6
-rw-r--r-- | mm/vmscan.c | 47
-rw-r--r-- | mm/vmstat.c | 15
-rw-r--r-- | mm/workingset.c | 160
33 files changed, 1707 insertions, 1187 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 957d3da53ddd..5c50b238b770 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruption. + Depending on runtime enablement, this results in a small or large + slowdown, but helps to find certain types of memory corruption. For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify @@ -26,5 +26,56 @@ config DEBUG_PAGEALLOC that would result in incorrect warnings of memory corruption after a resume because free pages are not saved to the suspend image. + By default this option will have a small overhead, e.g. by not + allowing the kernel mapping to be backed by large pages on some + architectures. Even bigger overhead comes when the debugging is + enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc + command line parameter. + +config DEBUG_PAGEALLOC_ENABLE_DEFAULT + bool "Enable debug page memory allocations by default?" + default n + depends on DEBUG_PAGEALLOC + ---help--- + Enable debug page memory allocations by default? This value + can be overridden by debug_pagealloc=off|on. + config PAGE_POISONING - bool + bool "Poison pages after freeing" + select PAGE_EXTENSION + select PAGE_POISONING_NO_SANITY if HIBERNATION + ---help--- + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages. The filling of the memory helps + reduce the risk of information leaks from freed data. This does + have a potential performance impact. + + Note that "poison" here is not the same thing as the "HWPoison" + for CONFIG_MEMORY_FAILURE. This is software poisoning only. + + If unsure, say N + +config PAGE_POISONING_NO_SANITY + depends on PAGE_POISONING + bool "Only poison, don't sanity check" + ---help--- + Skip the sanity checking on alloc, only fill the pages with + poison on free. This reduces some of the overhead of the + poisoning feature. + + If you are only interested in sanitization, say Y. Otherwise + say N. + +config PAGE_POISONING_ZERO + bool "Use zero for poisoning instead of random data" + depends on PAGE_POISONING + ---help--- + Instead of using the existing poison value, fill the pages with + zeros. This makes it harder to detect when errors are occurring + due to sanitization but the zeroing at free means that it is + no longer necessary to write zeros when GFP_ZERO is used on + allocation. 
+ + Enabling page poisoning with this option will disable hibernation + + If unsure, say N diff --git a/mm/Makefile b/mm/Makefile index 2ed43191fc3b..cfdd481d27a5 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -48,7 +48,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o -obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o +obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KMEMCHECK) += kmemcheck.o diff --git a/mm/compaction.c b/mm/compaction.c index 585de54dbe8c..93f71d968098 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -71,49 +71,6 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } -/* - * Check that the whole (or subset of) a pageblock given by the interval of - * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. - * - * Return struct page pointer of start_pfn, or NULL if checks were not passed. - * - * It's possible on some configurations to have a setup like node0 node1 node0 - * i.e. it's possible that all pages within a zones range of pages do not - * belong to a single zone. We assume that a border between node0 and node1 - * can occur within a single pageblock, but not a node0 node1 node0 - * interleaving within a single pageblock. It is therefore sufficient to check - * the first and last page of a pageblock and avoid checking each individual - * page in a pageblock. - */ -static struct page *pageblock_pfn_to_page(unsigned long start_pfn, - unsigned long end_pfn, struct zone *zone) -{ - struct page *start_page; - struct page *end_page; - - /* end_pfn is one past the range we are checking */ - end_pfn--; - - if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) - return NULL; - - start_page = pfn_to_page(start_pfn); - - if (page_zone(start_page) != zone) - return NULL; - - end_page = pfn_to_page(end_pfn); - - /* This gives a shorter code than deriving page_zone(end_page) */ - if (page_zone_id(start_page) != page_zone_id(end_page)) - return NULL; - - return start_page; -} - #ifdef CONFIG_COMPACTION /* Do not skip compaction more than 64 times */ @@ -200,7 +157,8 @@ static void reset_cached_positions(struct zone *zone) { zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; - zone->compact_cached_free_pfn = zone_end_pfn(zone); + zone->compact_cached_free_pfn = + round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); } /* @@ -554,13 +512,17 @@ unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn; + unsigned long isolated, pfn, block_start_pfn, block_end_pfn; LIST_HEAD(freelist); pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn += isolated, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { /* Protect pfn from changing by isolate_freepages_block */ unsigned long isolate_start_pfn = pfn; @@ -573,11 +535,13 @@ isolate_freepages_range(struct compact_control *cc, 
* scanning range to right one. */ if (pfn >= block_end_pfn) { + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); } - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, @@ -863,18 +827,23 @@ unsigned long isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn, block_end_pfn; + unsigned long pfn, block_start_pfn, block_end_pfn; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn = block_end_pfn, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { block_end_pfn = min(block_end_pfn, end_pfn); - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) continue; pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, @@ -1103,7 +1072,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1; static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { - unsigned long low_pfn, end_pfn; + unsigned long block_start_pfn; + unsigned long block_end_pfn; + unsigned long low_pfn; unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = @@ -1115,16 +1086,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; + block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < zone->zone_start_pfn) + block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); /* * Iterate over whole pageblocks until we find the first suitable. * Do not cross the free scanner. 
*/ - for (; end_pfn <= cc->free_pfn; - low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { + for (; block_end_pfn <= cc->free_pfn; + low_pfn = block_end_pfn, + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { /* * This can potentially iterate a massively long zone with @@ -1135,7 +1111,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, && compact_should_abort(cc)) break; - page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); + page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, + zone); if (!page) continue; @@ -1154,8 +1131,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ isolate_start_pfn = low_pfn; - low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, - isolate_mode); + low_pfn = isolate_migratepages_block(cc, low_pfn, + block_end_pfn, isolate_mode); if (!low_pfn || cc->contended) { acct_isolated(zone, cc); @@ -1371,11 +1348,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) */ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { - cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); zone->compact_cached_free_pfn = cc->free_pfn; } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { cc->migrate_pfn = start_pfn; zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; diff --git a/mm/debug.c b/mm/debug.c index f05b2d5d6481..df7247b0b532 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -9,75 +9,38 @@ #include <linux/mm.h> #include <linux/trace_events.h> #include <linux/memcontrol.h> - -static const struct trace_print_flags pageflag_names[] = { - {1UL << PG_locked, "locked" }, - {1UL << PG_error, "error" }, - {1UL << PG_referenced, "referenced" }, - {1UL << PG_uptodate, "uptodate" }, - {1UL << PG_dirty, "dirty" }, - {1UL << PG_lru, "lru" }, - {1UL << PG_active, "active" }, - {1UL << PG_slab, "slab" }, - {1UL << PG_owner_priv_1, "owner_priv_1" }, - {1UL << PG_arch_1, "arch_1" }, - {1UL << PG_reserved, "reserved" }, - {1UL << PG_private, "private" }, - {1UL << PG_private_2, "private_2" }, - {1UL << PG_writeback, "writeback" }, - {1UL << PG_head, "head" }, - {1UL << PG_swapcache, "swapcache" }, - {1UL << PG_mappedtodisk, "mappedtodisk" }, - {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_swapbacked, "swapbacked" }, - {1UL << PG_unevictable, "unevictable" }, -#ifdef CONFIG_MMU - {1UL << PG_mlocked, "mlocked" }, -#endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - {1UL << PG_uncached, "uncached" }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - {1UL << PG_hwpoison, "hwpoison" }, -#endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) - {1UL << PG_young, "young" }, - {1UL << PG_idle, "idle" }, -#endif +#include <trace/events/mmflags.h> +#include <linux/migrate.h> +#include <linux/page_owner.h> + +#include "internal.h" + +char *migrate_reason_names[MR_TYPES] = { + "compaction", + "memory_failure", + "memory_hotplug", + "syscall_or_cpuset", + "mempolicy_mbind", + "numa_misplaced", + "cma", }; -static void dump_flags(unsigned long flags, - const struct trace_print_flags *names, int count) -{ - const char *delim = ""; - unsigned long mask; - int i; - - pr_emerg("flags: %#lx(", flags); - - /* remove zone id */ - 
flags &= (1UL << NR_PAGEFLAGS) - 1; - - for (i = 0; i < count && flags; i++) { - - mask = names[i].mask; - if ((flags & mask) != mask) - continue; - - flags &= ~mask; - pr_cont("%s%s", delim, names[i].name); - delim = "|"; - } +const struct trace_print_flags pageflag_names[] = { + __def_pageflag_names, + {0, NULL} +}; - /* check for left over flags */ - if (flags) - pr_cont("%s%#lx", delim, flags); +const struct trace_print_flags gfpflag_names[] = { + __def_gfpflag_names, + {0, NULL} +}; - pr_cont(")\n"); -} +const struct trace_print_flags vmaflag_names[] = { + __def_vmaflag_names, + {0, NULL} +}; -void dump_page_badflags(struct page *page, const char *reason, - unsigned long badflags) +void __dump_page(struct page *page, const char *reason) { pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", page, atomic_read(&page->_count), page_mapcount(page), @@ -85,15 +48,13 @@ void dump_page_badflags(struct page *page, const char *reason, if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); - dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); + + pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + if (reason) pr_alert("page dumped because: %s\n", reason); - if (page->flags & badflags) { - pr_alert("bad because of flags:\n"); - dump_flags(page->flags & badflags, - pageflag_names, ARRAY_SIZE(pageflag_names)); - } + #ifdef CONFIG_MEMCG if (page->mem_cgroup) pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); @@ -102,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason, void dump_page(struct page *page, const char *reason) { - dump_page_badflags(page, reason, 0); + __dump_page(page, reason); + dump_page_owner(page); } EXPORT_SYMBOL(dump_page); #ifdef CONFIG_DEBUG_VM -static const struct trace_print_flags vmaflags_names[] = { - {VM_READ, "read" }, - {VM_WRITE, "write" }, - {VM_EXEC, "exec" }, - {VM_SHARED, "shared" }, - {VM_MAYREAD, "mayread" }, - {VM_MAYWRITE, "maywrite" }, - {VM_MAYEXEC, "mayexec" }, - {VM_MAYSHARE, "mayshare" }, - {VM_GROWSDOWN, "growsdown" }, - {VM_PFNMAP, "pfnmap" }, - {VM_DENYWRITE, "denywrite" }, - {VM_LOCKONFAULT, "lockonfault" }, - {VM_LOCKED, "locked" }, - {VM_IO, "io" }, - {VM_SEQ_READ, "seqread" }, - {VM_RAND_READ, "randread" }, - {VM_DONTCOPY, "dontcopy" }, - {VM_DONTEXPAND, "dontexpand" }, - {VM_ACCOUNT, "account" }, - {VM_NORESERVE, "noreserve" }, - {VM_HUGETLB, "hugetlb" }, -#if defined(CONFIG_X86) - {VM_PAT, "pat" }, -#elif defined(CONFIG_PPC) - {VM_SAO, "sao" }, -#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) - {VM_GROWSUP, "growsup" }, -#elif !defined(CONFIG_MMU) - {VM_MAPPED_COPY, "mappedcopy" }, -#else - {VM_ARCH_1, "arch_1" }, -#endif - {VM_DONTDUMP, "dontdump" }, -#ifdef CONFIG_MEM_SOFT_DIRTY - {VM_SOFTDIRTY, "softdirty" }, -#endif - {VM_MIXEDMAP, "mixedmap" }, - {VM_HUGEPAGE, "hugepage" }, - {VM_NOHUGEPAGE, "nohugepage" }, - {VM_MERGEABLE, "mergeable" }, -}; - void dump_vma(const struct vm_area_struct *vma) { pr_emerg("vma %p start %p end %p\n" "next %p prev %p mm %p\n" "prot %lx anon_vma %p vm_ops %p\n" - "pgoff %lx file %p private_data %p\n", + "pgoff %lx file %p private_data %p\n" + "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, vma->vm_prev, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, - vma->vm_file, 
vma->vm_private_data); - dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); + vma->vm_file, vma->vm_private_data, + vma->vm_flags, &vma->vm_flags); } EXPORT_SYMBOL(dump_vma); @@ -196,7 +116,7 @@ void dump_mm(const struct mm_struct *mm) #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) "tlb_flush_pending %d\n" #endif - "%s", /* This is here to hold the comma */ + "def_flags: %#lx(%pGv)\n", mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, #ifdef CONFIG_MMU @@ -230,11 +150,8 @@ void dump_mm(const struct mm_struct *mm) #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) mm->tlb_flush_pending, #endif - "" /* This is here to not have a comma! */ - ); - - dump_flags(mm->def_flags, vmaflags_names, - ARRAY_SIZE(vmaflags_names)); + mm->def_flags, &mm->def_flags + ); } #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/failslab.c b/mm/failslab.c index 79171b4a5826..b0fac98cd938 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,5 +1,7 @@ #include <linux/fault-inject.h> #include <linux/slab.h> +#include <linux/mm.h> +#include "slab.h" static struct { struct fault_attr attr; @@ -11,18 +13,22 @@ static struct { .cache_filter = false, }; -bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) +bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) { + /* No fault-injection for bootstrap cache */ + if (unlikely(s == kmem_cache)) + return false; + if (gfpflags & __GFP_NOFAIL) return false; if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) return false; - if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) + if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) return false; - return should_fail(&failslab.attr, size); + return should_fail(&failslab.attr, s->object_size); } static int __init setup_failslab(char *str) diff --git a/mm/filemap.c b/mm/filemap.c index da7a35d83de7..61b441b191ad 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -101,7 +101,7 @@ * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) + * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) @@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock and - * mem_cgroup_begin_page_stat(). + * is safe. The caller must hold the mapping's tree_lock. */ -void __delete_from_page_cache(struct page *page, void *shadow, - struct mem_cgroup *memcg) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; @@ -239,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow, * anyway will be cleared before returning page into buddy allocator. 
*/ if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, memcg, - inode_to_wb(mapping->host)); + account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } /** @@ -254,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - struct mem_cgroup *memcg; unsigned long flags; void (*freepage)(struct page *); @@ -263,11 +259,9 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; - memcg = mem_cgroup_begin_page_stat(page); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); if (freepage) freepage(page); @@ -551,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); - struct mem_cgroup *memcg; unsigned long flags; pgoff_t offset = old->index; @@ -561,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - memcg = mem_cgroup_begin_page_stat(old); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(old, NULL, memcg); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -576,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); - mem_cgroup_replace_page(old, new); + mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) freepage(old); @@ -1668,6 +1659,15 @@ find_page: index, last_index - index); } if (!PageUptodate(page)) { + /* + * See comment in do_read_cache_page on why + * wait_on_page_locked is used to avoid unnecessarily + * serialisations and why it's safe. 
+ */ + wait_on_page_locked_killable(page); + if (PageUptodate(page)) + goto page_ok; + if (inode->i_blkbits == PAGE_CACHE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; @@ -2303,7 +2303,7 @@ static struct page *wait_on_page_read(struct page *page) return page; } -static struct page *__read_cache_page(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data, @@ -2325,53 +2325,74 @@ repeat: /* Presumably ENOMEM for radix tree node */ return ERR_PTR(err); } + +filler: err = filler(data, page); if (err < 0) { page_cache_release(page); - page = ERR_PTR(err); - } else { - page = wait_on_page_read(page); + return ERR_PTR(err); } - } - return page; -} - -static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data, - gfp_t gfp) -{ - struct page *page; - int err; + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; + goto out; + } + if (PageUptodate(page)) + goto out; -retry: - page = __read_cache_page(mapping, index, filler, data, gfp); - if (IS_ERR(page)) - return page; + /* + * Page is not up to date and may be locked due one of the following + * case a: Page is being filled and the page lock is held + * case b: Read/write error clearing the page uptodate status + * case c: Truncation in progress (page locked) + * case d: Reclaim in progress + * + * Case a, the page will be up to date when the page is unlocked. + * There is no need to serialise on the page lock here as the page + * is pinned so the lock gives no additional protection. Even if the + * the page is truncated, the data is still valid if PageUptodate as + * it's a race vs truncate race. + * Case b, the page will not be up to date + * Case c, the page may be truncated but in itself, the data may still + * be valid after IO completes as it's a read vs truncate race. The + * operation must restart if the page is not uptodate on unlock but + * otherwise serialising on page lock to stabilise the mapping gives + * no additional guarantees to the caller as the page lock is + * released before return. + * Case d, similar to truncation. If reclaim holds the page lock, it + * will be a race with remove_mapping that determines if the mapping + * is valid on unlock but otherwise the data is valid and there is + * no need to serialise with page lock. + * + * As the page lock gives no additional guarantee, we optimistically + * wait on the page to be unlocked and check if it's up to date and + * use the page if it is. Otherwise, the page lock is required to + * distinguish between the different cases. The motivation is that we + * avoid spurious serialisations and wakeups when multiple processes + * wait on the same page for IO to complete. 
+ */ + wait_on_page_locked(page); if (PageUptodate(page)) goto out; + /* Distinguish between all the cases under the safety of the lock */ lock_page(page); + + /* Case c or d, restart the operation */ if (!page->mapping) { unlock_page(page); page_cache_release(page); - goto retry; + goto repeat; } + + /* Someone else locked and filled the page in a very small window */ if (PageUptodate(page)) { unlock_page(page); goto out; } - err = filler(data, page); - if (err < 0) { - page_cache_release(page); - return ERR_PTR(err); - } else { - page = wait_on_page_read(page); - if (IS_ERR(page)) - return page; - } + goto filler; + out: mark_page_accessed(page); return page; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e10a4fee88d2..1ea21e203a70 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3220,28 +3220,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) } } -static int __split_huge_page_tail(struct page *head, int tail, +static void __split_huge_page_tail(struct page *head, int tail, struct lruvec *lruvec, struct list_head *list) { - int mapcount; struct page *page_tail = head + tail; - mapcount = atomic_read(&page_tail->_mapcount) + 1; + VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); /* * tail_page->_count is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_add(), we + * tail_page. If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with * get_page_unless_zero(), and atomic_set() is implemented in C not * using locked ops. spin_unlock on x86 sometime uses locked ops * because of PPro errata 66, 92, so unless somebody can guarantee * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_add(). + * it's safer to use atomic_inc(). 
*/ - atomic_add(mapcount + 1, &page_tail->_count); - + atomic_inc(&page_tail->_count); page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -3275,8 +3273,6 @@ static int __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); lru_add_page_tail(head, page_tail, lruvec, list); - - return mapcount; } static void __split_huge_page(struct page *page, struct list_head *list) @@ -3284,7 +3280,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; - int i, tail_mapcount; + int i; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -3293,10 +3289,8 @@ static void __split_huge_page(struct page *page, struct list_head *list) /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - tail_mapcount = 0; for (i = HPAGE_PMD_NR - 1; i >= 1; i--) - tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); - atomic_sub(tail_mapcount, &head->_count); + __split_huge_page_tail(head, i, lruvec, list); ClearPageCompound(head); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/internal.h b/mm/internal.h index a38a21ebddb4..ad9400d759c8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -14,6 +14,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> +#include <linux/tracepoint-defs.h> /* * The set of flags that only affect watermark checking and reclaim @@ -131,6 +132,18 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) return page_idx ^ (1 << order); } +extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone); + +static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + if (zone->contiguous) + return pfn_to_page(start_pfn); + + return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); +} + extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); @@ -466,4 +479,9 @@ static inline void try_to_unmap_flush_dirty(void) } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + +extern const struct trace_print_flags pageflag_names[]; +extern const struct trace_print_flags vmaflag_names[]; +extern const struct trace_print_flags gfpflag_names[]; + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index cab58bb592d8..6f4f424037c0 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -60,6 +60,9 @@ void kmemcheck_free_shadow(struct page *page, int order) void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, size_t size) { + if (unlikely(!object)) /* Skip object if allocation failed */ + return; + /* * Has already been memset(), which initializes the shadow for us * as well. 
diff --git a/mm/madvise.c b/mm/madvise.c index f56825b6d2e1..a01147359f3b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) } pr_info("Injecting memory failure for page %#lx at %#lx\n", page_to_pfn(p), start); - /* Ignore return value for now */ - memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + if (ret) + return ret; } return 0; } @@ -638,14 +639,28 @@ madvise_behavior_valid(int behavior) * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. + * MADV_FREE - the application marks pages in the given range as lazy free, + * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_HWPOISON - trigger memory error handler as if the given memory range + * were corrupted by unrecoverable hardware memory failure. + * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. + * MADV_HUGEPAGE - the application wants to back the given range by transparent + * huge pages in the future. Existing pages might be coalesced and + * new pages might be allocated as THP. + * MADV_NOHUGEPAGE - mark the given range as not worth being backed by + * transparent huge pages so the existing pages will not be + * coalesced into THP and new pages will not be allocated as THP. + * MADV_DONTDUMP - the application wants to prevent pages in the given range + * from being included in its core dump. + * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. 
* * return values: * zero - success diff --git a/mm/memblock.c b/mm/memblock.c index dd7989929f13..fc7824fa1b42 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -612,14 +612,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *type = &memblock.memory; - memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(type, base, size, nid, flags); + return memblock_add_range(&memblock.memory, base, size, nid, flags); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) @@ -740,14 +738,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *type = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(type, base, size, nid, flags); + return memblock_add_range(&memblock.reserved, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d06cae2de783..42882c1e7fce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } -/* - * We restrict the id in the range of [1, 65535], so it can fit into - * an unsigned short. - */ -#define MEM_CGROUP_ID_MAX USHRT_MAX - -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) -{ - return memcg->css.id; -} - -/* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. (dropping - * refcnt from swap can be called against removed memcg.) - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); -} - #ifndef CONFIG_SLOB /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. @@ -1709,19 +1684,13 @@ cleanup: } /** - * mem_cgroup_begin_page_stat - begin a page state statistics transaction - * @page: page that is going to change accounted state - * - * This function must mark the beginning of an accounted page state - * change to prevent double accounting when the page is concurrently - * being moved to another memcg: + * lock_page_memcg - lock a page->mem_cgroup binding + * @page: the page * - * memcg = mem_cgroup_begin_page_stat(page); - * if (TestClearPageState(page)) - * mem_cgroup_update_page_stat(memcg, state, -1); - * mem_cgroup_end_page_stat(memcg); + * This function protects unlocked LRU pages from being moved to + * another cgroup and stabilizes their page->mem_cgroup binding. */ -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) +void lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1730,25 +1699,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. 
- * - * The RCU lock also protects the memcg from being freed when - * the page state that is going to change is the only thing - * preventing the page from being uncharged. - * E.g. end-writeback clearing PageWriteback(), which allows - * migration to go ahead and uncharge the page before the - * account transaction might be complete. */ rcu_read_lock(); if (mem_cgroup_disabled()) - return NULL; + return; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return NULL; + return; if (atomic_read(&memcg->moving_account) <= 0) - return memcg; + return; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1759,21 +1721,23 @@ again: /* * When charge migration first begins, we can have locked and * unlocked page stat updates happening concurrently. Track - * the task who has the lock for mem_cgroup_end_page_stat(). + * the task who has the lock for unlock_page_memcg(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return memcg; + return; } -EXPORT_SYMBOL(mem_cgroup_begin_page_stat); +EXPORT_SYMBOL(lock_page_memcg); /** - * mem_cgroup_end_page_stat - finish a page state statistics transaction - * @memcg: the memcg that was accounted against + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @page: the page */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) +void unlock_page_memcg(struct page *page) { + struct mem_cgroup *memcg = page->mem_cgroup; + if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1785,7 +1749,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) rcu_read_unlock(); } -EXPORT_SYMBOL(mem_cgroup_end_page_stat); +EXPORT_SYMBOL(unlock_page_memcg); /* * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -4488,7 +4452,7 @@ static int mem_cgroup_move_account(struct page *page, VM_BUG_ON(compound && !PageTransHuge(page)); /* - * Prevent mem_cgroup_replace_page() from looking at + * Prevent mem_cgroup_migrate() from looking at * page->mem_cgroup of its source page while we change it. */ ret = -EBUSY; @@ -4923,9 +4887,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) lru_add_drain_all(); /* - * Signal mem_cgroup_begin_page_stat() to take the memcg's - * move_lock while we're moving its pages to another memcg. - * Then wait for already started RCU-only updates to finish. + * Signal lock_page_memcg() to take the memcg's move_lock + * while we're moving its pages to another memcg. Then wait + * for already started RCU-only updates to finish. */ atomic_inc(&mc.from->moving_account); synchronize_rcu(); @@ -5517,16 +5481,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_replace_page - migrate a charge to another page - * @oldpage: currently charged page - * @newpage: page to transfer the charge to + * mem_cgroup_migrate - charge a page's replacement + * @oldpage: currently circulating page + * @newpage: replacement page * - * Migrate the charge from @oldpage to @newpage. + * Charge @newpage as a replacement page for @oldpage. @oldpage will + * be uncharged upon free. * * Both pages must be locked, @newpage->mapping must be set up. - * Either or both pages might be on the LRU already. 
*/ -void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; unsigned int nr_pages; @@ -5559,7 +5523,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); - commit_charge(newpage, memcg, true); + commit_charge(newpage, memcg, false); local_irq_disable(); mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ac595e7a3a95..67c30eb993f0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -826,8 +826,6 @@ static struct page_state { #undef lru #undef swapbacked #undef head -#undef tail -#undef compound #undef slab #undef reserved diff --git a/mm/memory.c b/mm/memory.c index 906d8e3b42c0..0e247642ed5b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1897,7 +1897,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long end = addr + size; int err; - BUG_ON(addr >= end); + if (WARN_ON(addr >= end)) + return -EINVAL; + pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -3143,8 +3145,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { - pgoff_t pgoff = (((address & PAGE_MASK) - - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + pgoff_t pgoff = linear_page_index(vma, address); pte_unmap(page_table); /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 979b18cbd343..24ea06393816 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -77,6 +77,9 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +bool memhp_auto_online; +EXPORT_SYMBOL_GPL(memhp_auto_online); + void get_online_mems(void) { might_sleep(); @@ -509,6 +512,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, int start_sec, end_sec; struct vmem_altmap *altmap; + clear_zone_contiguous(zone); + /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -521,7 +526,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, if (altmap->base_pfn != phys_start_pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); - return -EINVAL; + err = -EINVAL; + goto out; } altmap->alloc = 0; } @@ -539,7 +545,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, err = 0; } vmemmap_populate_print_last(); - +out: + set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -811,6 +818,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } } + clear_zone_contiguous(zone); + /* * We can only remove entire sections */ @@ -826,6 +835,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, if (ret) break; } + + set_zone_contiguous(zone); + return ret; } EXPORT_SYMBOL_GPL(__remove_pages); @@ -1261,8 +1273,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, return zone_default; } +static int online_memory_block(struct memory_block *mem, void *arg) +{ + return memory_block_change_state(mem, MEM_ONLINE, 
MEM_OFFLINE); +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res) +int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; pg_data_t *pgdat = NULL; @@ -1322,6 +1339,11 @@ int __ref add_memory_resource(int nid, struct resource *res) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); + /* online pages if requested */ + if (online) + walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), + NULL, online_memory_block); + goto out; error: @@ -1345,7 +1367,7 @@ int __ref add_memory(int nid, u64 start, u64 size) if (IS_ERR(res)) return PTR_ERR(res); - ret = add_memory_resource(nid, res); + ret = add_memory_resource(nid, res, memhp_auto_online); if (ret < 0) release_memory_resource(res); return ret; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9a3f6b90e628..8cbc74387df3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + if (!is_vm_hugetlb_page(vma) && + (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) && + !(vma->vm_flags & VM_MIXEDMAP)) change_prot_numa(vma, start, endvma); return 1; } diff --git a/mm/migrate.c b/mm/migrate.c index 3ad0fea5c438..568284ec75d4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,6 +38,7 @@ #include <linux/balloon_compaction.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> +#include <linux/page_owner.h> #include <asm/tlbflush.h> @@ -325,7 +326,6 @@ int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; /* No turning back from here */ - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) @@ -372,7 +372,6 @@ int migrate_page_move_mapping(struct address_space *mapping, * Now we know that no one else is looking at the page: * no turning back from here. */ - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) @@ -457,9 +456,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; + get_page(newpage); radix_tree_replace_slot(pslot, newpage); @@ -467,6 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); + return MIGRATEPAGE_SUCCESS; } @@ -578,6 +578,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) */ if (PageWriteback(newpage)) end_page_writeback(newpage); + + copy_page_owner(page, newpage); + + mem_cgroup_migrate(page, newpage); } /************************************************************ @@ -772,7 +776,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, * page is freed; but stats require that PageAnon be left as PageAnon. 
*/ if (rc == MIGRATEPAGE_SUCCESS) { - set_page_memcg(page, NULL); if (!PageAnon(page)) page->mapping = NULL; } @@ -952,8 +955,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, } rc = __unmap_and_move(page, newpage, force, mode); - if (rc == MIGRATEPAGE_SUCCESS) + if (rc == MIGRATEPAGE_SUCCESS) { put_new_page = NULL; + set_page_owner_migrate_reason(newpage, reason); + } out: if (rc != -EAGAIN) { @@ -1018,7 +1023,7 @@ out: static int unmap_and_move_huge_page(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *hpage, int force, - enum migrate_mode mode) + enum migrate_mode mode, int reason) { int rc = -EAGAIN; int *result = NULL; @@ -1076,6 +1081,7 @@ put_anon: if (rc == MIGRATEPAGE_SUCCESS) { hugetlb_cgroup_migrate(hpage, new_hpage); put_new_page = NULL; + set_page_owner_migrate_reason(new_hpage, reason); } unlock_page(hpage); @@ -1148,7 +1154,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, page, - pass > 2, mode); + pass > 2, mode, reason); else rc = unmap_and_move(get_new_page, put_new_page, private, page, pass > 2, mode, @@ -1836,9 +1842,8 @@ fail_putback: } mlock_migrate_page(new_page, page); - set_page_memcg(new_page, page_memcg(page)); - set_page_memcg(page, NULL); page_remove_rmap(page, true); + set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dc490c06941b..e97a05d9621f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -386,10 +386,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, oc->order, + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, " + "oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); + cpuset_print_current_mems_allowed(); dump_stack(); if (memcg) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6fe7d15bd1f7..11ff8f758631 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1169,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; + unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except @@ -1293,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ - step >>= dirty_ratelimit / (2 * step + 1); - /* - * Limit the tracking speed to avoid overshooting. - */ - step = (step + 7) / 8; + shift = dirty_ratelimit / (2 * step + 1); + if (shift < BITS_PER_LONG) + step = DIV_ROUND_UP(step >> shift, 8); + else + step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; @@ -2409,12 +2410,11 @@ int __set_page_dirty_no_writeback(struct page *page) /* * Helper function for set_page_dirty family. * - * Caller must hold mem_cgroup_begin_page_stat(). + * Caller must hold lock_page_memcg(). * * NOTE: This relies on being atomic wrt interrupts. 
*/ -void account_page_dirtied(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg) +void account_page_dirtied(struct page *page, struct address_space *mapping) { struct inode *inode = mapping->host; @@ -2426,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping, inode_attach_wb(inode, page); wb = inode_to_wb(inode); - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); @@ -2441,13 +2441,13 @@ EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold mem_cgroup_begin_page_stat(). + * Caller must hold lock_page_memcg(). */ void account_page_cleaned(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg, struct bdi_writeback *wb) + struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_CACHE_SIZE); @@ -2468,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, */ int __set_page_dirty_nobuffers(struct page *page) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; if (!mapping) { - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return 1; } spin_lock_irqsave(&mapping->tree_lock, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2495,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page) } return 1; } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2625,17 +2623,16 @@ void cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - struct mem_cgroup *memcg; bool locked; - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) - account_page_cleaned(page, mapping, memcg, wb); + account_page_cleaned(page, mapping, wb); unlocked_inode_to_wb_end(inode, locked); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); } else { ClearPageDirty(page); } @@ -2666,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - struct mem_cgroup *memcg; bool locked; /* @@ -2704,16 +2700,14 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. 
*/ - memcg = mem_cgroup_begin_page_stat(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } unlocked_inode_to_wb_end(inode, locked); - mem_cgroup_end_page_stat(memcg); return ret; } return TestClearPageDirty(page); @@ -2723,10 +2717,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret; - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2750,21 +2743,20 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret; - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2792,10 +2784,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 838ca8bb64f7..c46b75d14b6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif }; +char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Movable", + "Reclaimable", + "HighAtomic", +#ifdef CONFIG_CMA + "CMA", +#endif +#ifdef CONFIG_MEMORY_ISOLATION + "Isolate", +#endif +}; + compound_page_dtor * const compound_page_dtors[] = { NULL, free_compound_page, @@ -247,6 +260,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; +static bool mirrored_kernelcore; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -416,7 +430,7 @@ static void bad_page(struct page *page, const char *reason, goto out; } if (nr_unshown) { - printk(KERN_ALERT + pr_alert( "BUG: Bad page state: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; @@ -426,9 +440,14 @@ static void bad_page(struct page *page, const char *reason, if (nr_shown++ == 0) resume = jiffies + 60 * HZ; - printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", + pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - dump_page_badflags(page, reason, bad_flags); + __dump_page(page, reason); + bad_flags &= page->flags; + if (bad_flags) + pr_alert("bad because of flags: %#lx(%pGp)\n", + bad_flags, &bad_flags); + 
dump_page_owner(page); print_modules(); dump_stack(); @@ -477,7 +496,8 @@ void prep_compound_page(struct page *page, unsigned int order) #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -bool _debug_pagealloc_enabled __read_mostly; +bool _debug_pagealloc_enabled __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) @@ -488,6 +508,9 @@ static int __init early_debug_pagealloc(char *buf) if (strcmp(buf, "on") == 0) _debug_pagealloc_enabled = true; + if (strcmp(buf, "off") == 0) + _debug_pagealloc_enabled = false; + return 0; } early_param("debug_pagealloc", early_debug_pagealloc); @@ -1002,6 +1025,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) PAGE_SIZE << order); } arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); kernel_map_pages(page, 1 << order, 0); return true; @@ -1104,6 +1128,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, return __free_pages_boot_core(page, pfn, order); } +/* + * Check that the whole (or subset of) a pageblock given by the interval of + * [start_pfn, end_pfn) is valid and within the same zone, before scanning it + * with the migration of free compaction scanner. The scanners then need to + * use only pfn_valid_within() check for arches that allow holes within + * pageblocks. + * + * Return struct page pointer of start_pfn, or NULL if checks were not passed. + * + * It's possible on some configurations to have a setup like node0 node1 node0 + * i.e. it's possible that all pages within a zones range of pages do not + * belong to a single zone. We assume that a border between node0 and node1 + * can occur within a single pageblock, but not a node0 node1 node0 + * interleaving within a single pageblock. It is therefore sufficient to check + * the first and last page of a pageblock and avoid checking each individual + * page in a pageblock. 
+ */ +struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + struct page *start_page; + struct page *end_page; + + /* end_pfn is one past the range we are checking */ + end_pfn--; + + if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) + return NULL; + + start_page = pfn_to_page(start_pfn); + + if (page_zone(start_page) != zone) + return NULL; + + end_page = pfn_to_page(end_pfn); + + /* This gives a shorter code than deriving page_zone(end_page) */ + if (page_zone_id(start_page) != page_zone_id(end_page)) + return NULL; + + return start_page; +} + +void set_zone_contiguous(struct zone *zone) +{ + unsigned long block_start_pfn = zone->zone_start_pfn; + unsigned long block_end_pfn; + + block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); + for (; block_start_pfn < zone_end_pfn(zone); + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); + + if (!__pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, zone)) + return; + } + + /* We confirm that there is no hole */ + zone->contiguous = true; +} + +void clear_zone_contiguous(struct zone *zone) +{ + zone->contiguous = false; +} + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __init deferred_free_range(struct page *page, unsigned long pfn, int nr_pages) @@ -1254,9 +1347,13 @@ free_range: pgdat_init_report_one_done(); return 0; } +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void __init page_alloc_init_late(void) { + struct zone *zone; + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT int nid; /* There will be num_node_state(N_MEMORY) threads */ @@ -1270,8 +1367,11 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); +#endif + + for_each_populated_zone(zone) + set_zone_contiguous(zone); } -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ @@ -1381,15 +1481,24 @@ static inline int check_new_page(struct page *page) return 0; } +static inline bool free_pages_prezeroed(bool poisoned) +{ + return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && + page_poisoning_enabled() && poisoned; +} + static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, int alloc_flags) { int i; + bool poisoned = true; for (i = 0; i < (1 << order); i++) { struct page *p = page + i; if (unlikely(check_new_page(p))) return 1; + if (poisoned) + poisoned &= page_is_poisoned(p); } set_page_private(page, 0); @@ -1397,9 +1506,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); + kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); - if (gfp_flags & __GFP_ZERO) + if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); @@ -2690,9 +2800,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) 
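Aside: set_zone_contiguous() records, once at init time, that a zone has no holes, so the compaction scanners can skip the first/last-page validation on every pageblock. The companion fast-path wrapper (which, as far as I recall, lands in mm/internal.h in the same series) behaves roughly like this sketch:

/* Sketch of the fast path: only fall back to the full check when the
 * zone has not been verified hole-free at init time. */
static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone)
{
        if (zone->contiguous)
                return pfn_to_page(start_pfn);

        return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
}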
va_end(args); } - pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", - current->comm, order, gfp_mask); - + pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", + current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); @@ -4491,6 +4600,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; unsigned long nr_initialised = 0; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + struct memblock_region *r = NULL, *tmp; +#endif if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; @@ -4504,20 +4616,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* - * There can be holes in boot-time mem_map[]s - * handed to this function. They do not - * exist on hotplugged memory. + * There can be holes in boot-time mem_map[]s handed to this + * function. They do not exist on hotplugged memory. */ - if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) + if (context != MEMMAP_EARLY) + goto not_early; + + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) + break; + +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * If not mirrored_kernelcore and ZONE_MOVABLE exists, range + * from zone_movable_pfn[nid] to end of each node should be + * ZONE_MOVABLE not ZONE_NORMAL. skip it. + */ + if (!mirrored_kernelcore && zone_movable_pfn[nid]) + if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) continue; - if (!early_pfn_in_nid(pfn, nid)) + + /* + * Check given memblock attribute by firmware which can affect + * kernel memory layout. If zone==ZONE_MOVABLE but memory is + * mirrored, it's an overlapped memmap init. skip it. + */ + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, tmp) + if (pfn < memblock_region_memory_end_pfn(tmp)) + break; + r = tmp; + } + if (pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + /* already initialized as NORMAL */ + pfn = memblock_region_memory_end_pfn(r); continue; - if (!update_defer_init(pgdat, pfn, end_pfn, - &nr_initialised)) - break; + } } +#endif +not_early: /* * Mark the block movable so that blocks are reserved for * movable at startup. 
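Aside: the memmap_init_zone() hunk above keys off the memblock "mirror" attribute that firmware reports for reliable (mirrored) ranges: with kernelcore=mirror, mirrored memory stays ZONE_NORMAL and everything else is treated as ZONE_MOVABLE. As a rough illustration of the memblock side (the helper below is hypothetical; the patch open-codes this walk and caches the region pointer to avoid rescanning for every pfn):

#include <linux/memblock.h>

/* Hypothetical helper: does @pfn fall inside a firmware-mirrored
 * memblock region? */
static bool my_pfn_is_mirrored(unsigned long pfn)
{
        struct memblock_region *r;

        for_each_memblock(memory, r) {
                if (pfn >= memblock_region_memory_base_pfn(r) &&
                    pfn <  memblock_region_memory_end_pfn(r))
                        return memblock_is_mirror(r);
        }
        return false;
}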
This will force kernel allocations @@ -4934,11 +5077,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); - /* Adjust for ZONE_MOVABLE starting within this range */ - } else if (*zone_start_pfn < zone_movable_pfn[nid] && - *zone_end_pfn > zone_movable_pfn[nid]) { - *zone_end_pfn = zone_movable_pfn[nid]; - /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -4953,31 +5091,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn, unsigned long *ignored) { - unsigned long zone_start_pfn, zone_end_pfn; - /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; /* Get the start and end of the zone */ - zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; + *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, - &zone_start_pfn, &zone_end_pfn); + zone_start_pfn, zone_end_pfn); /* Check that this node has pages within the zone's required range */ - if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) + if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) return 0; /* Move the zone boundaries inside the node if necessary */ - zone_end_pfn = min(zone_end_pfn, node_end_pfn); - zone_start_pfn = max(zone_start_pfn, node_start_pfn); + *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); + *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); /* Return the spanned pages */ - return zone_end_pfn - zone_start_pfn; + return *zone_end_pfn - *zone_start_pfn; } /* @@ -5023,6 +5161,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long nr_absent; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) @@ -5034,7 +5173,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); - return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + + /* + * ZONE_MOVABLE handling. + * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages + * and vice versa. 
+ */ + if (zone_movable_pfn[nid]) { + if (mirrored_kernelcore) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + } + } else { + if (zone_type == ZONE_NORMAL) + nr_absent += node_end_pfn - zone_movable_pfn[nid]; + } + } + + return nr_absent; } #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ @@ -5042,8 +5213,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn, unsigned long *zones_size) { + unsigned int zone; + + *zone_start_pfn = node_start_pfn; + for (zone = 0; zone < zone_type; zone++) + *zone_start_pfn += zones_size[zone]; + + *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; + return zones_size[zone_type]; } @@ -5072,15 +5253,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; + unsigned long zone_start_pfn, zone_end_pfn; unsigned long size, real_size; size = zone_spanned_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, + &zone_start_pfn, + &zone_end_pfn, zones_size); real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size); + if (size) + zone->zone_start_pfn = zone_start_pfn; + else + zone->zone_start_pfn = 0; zone->spanned_pages = size; zone->present_pages = real_size; @@ -5201,7 +5389,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - unsigned long zone_start_pfn = pgdat->node_start_pfn; int ret; pgdat_resize_init(pgdat); @@ -5222,6 +5409,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; + unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; realsize = freesize = zone->present_pages; @@ -5290,7 +5478,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) ret = init_currently_empty_zone(zone, zone_start_pfn, size); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); - zone_start_pfn += size; } } @@ -5358,6 +5545,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? 
((u64)end_pfn << PAGE_SHIFT) - 1 : 0); +#else + start_pfn = node_start_pfn; #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5529,6 +5718,36 @@ static void __init find_zone_movable_pfns_for_nodes(void) } /* + * If kernelcore=mirror is specified, ignore movablecore option + */ + if (mirrored_kernelcore) { + bool mem_below_4gb_not_mirrored = false; + + for_each_memblock(memory, r) { + if (memblock_is_mirror(r)) + continue; + + nid = r->nid; + + usable_startpfn = memblock_region_memory_base_pfn(r); + + if (usable_startpfn < 0x100000) { + mem_below_4gb_not_mirrored = true; + continue; + } + + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + if (mem_below_4gb_not_mirrored) + pr_warn("This configuration results in unmirrored kernel memory."); + + goto out2; + } + + /* * If movablecore=nn[KMG] was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore @@ -5788,6 +6007,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core) */ static int __init cmdline_parse_kernelcore(char *p) { + /* parse kernelcore=mirror */ + if (parse_option_str(p, "mirror")) { + mirrored_kernelcore = true; + return 0; + } + return cmdline_parse_core(p, &required_kernelcore); } diff --git a/mm/page_ext.c b/mm/page_ext.c index 292ca7b8debd..2d864e64f7fe 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. 
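Aside: the command-line side of kernelcore=mirror is deliberately small; cmdline_parse_kernelcore() now recognizes the "mirror" keyword before falling back to the usual nn[KMG] size parsing. parse_option_str() matches one option inside a comma-separated value string, so a hedged sketch of the same pattern for a hypothetical parameter looks like this:

#include <linux/init.h>
#include <linux/kernel.h>

static bool my_mirror_mode __initdata;        /* hypothetical flag */

static int __init my_parse_core(char *p)
{
        /* "mycore=mirror" (or "mycore=mirror,...") enables mirror mode */
        if (parse_option_str(p, "mirror")) {
                my_mirror_mode = true;
                return 0;
        }
        /* otherwise treat @p as a size, e.g. via memparse() */
        return 0;
}
early_param("mycore", my_parse_core);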
+ * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_owner.c b/mm/page_owner.c index 983c3a10fa07..44ad1f00c4e1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -5,10 +5,12 @@ #include <linux/bootmem.h> #include <linux/stacktrace.h> #include <linux/page_owner.h> +#include <linux/jump_label.h> +#include <linux/migrate.h> #include "internal.h" static bool page_owner_disabled = true; -bool page_owner_inited __read_mostly; +DEFINE_STATIC_KEY_FALSE(page_owner_inited); static void init_early_allocated_pages(void); @@ -37,7 +39,7 @@ static void init_page_owner(void) if (page_owner_disabled) return; - page_owner_inited = true; + static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) page_ext->order = order; page_ext->gfp_mask = gfp_mask; page_ext->nr_entries = trace.nr_entries; + page_ext->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +void __set_page_owner_migrate_reason(struct page *page, int reason) +{ + struct page_ext *page_ext = lookup_page_ext(page); + + page_ext->last_migrate_reason = reason; +} + gfp_t __get_page_owner_gfp(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); @@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page) return page_ext->gfp_mask; } +void __copy_page_owner(struct page *oldpage, struct page *newpage) +{ + struct page_ext *old_ext = lookup_page_ext(oldpage); + struct page_ext *new_ext = lookup_page_ext(newpage); + int i; + + new_ext->order = old_ext->order; + new_ext->gfp_mask = old_ext->gfp_mask; + new_ext->nr_entries = old_ext->nr_entries; + + for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++) + new_ext->trace_entries[i] = old_ext->trace_entries[i]; + + /* + * We don't clear the bit on the oldpage as it's going to be freed + * after migration. Until then, the info can be useful in case of + * a bug, and the overal stats will be off a bit only temporarily. + * Also, migrate_misplaced_transhuge_page() can still fail the + * migration and then we want the oldpage to retain the info. But + * in that case we also don't need to explicitly clear the info from + * the new page, which will be freed. + */ + __set_bit(PAGE_EXT_OWNER, &new_ext->flags); +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_ext *page_ext) @@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask 0x%x\n", - page_ext->order, page_ext->gfp_mask); + "Page allocated via order %u, mask %#x(%pGg)\n", + page_ext->order, page_ext->gfp_mask, + &page_ext->gfp_mask); if (ret >= count) goto err; @@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, pageblock_mt = get_pfnblock_migratetype(page, pfn); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, - "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", + "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, + migratetype_names[page_mt], pfn >> pageblock_order, - pageblock_mt, - pageblock_mt != page_mt ? "Fallback" : " ", - PageLocked(page) ? "K" : " ", - PageError(page) ? "E" : " ", - PageReferenced(page) ? "R" : " ", - PageUptodate(page) ? "U" : " ", - PageDirty(page) ? "D" : " ", - PageLRU(page) ? 
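Aside: turning page_owner_inited into a static key means the check that sits in the allocator hot paths costs a patched-out jump rather than a memory load and branch while page_owner stays disabled. The general pattern, with hypothetical names:

#include <linux/jump_label.h>
#include <linux/printk.h>

DEFINE_STATIC_KEY_FALSE(my_feature_key);        /* starts disabled */

static void my_feature_init(void)
{
        static_branch_enable(&my_feature_key);  /* patches all call sites */
}

static void my_hot_path(void)
{
        /* compiles to a no-op branch until my_feature_init() runs */
        if (static_branch_unlikely(&my_feature_key))
                pr_info("slow, optional work goes here\n");
}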
"L" : " ", - PageActive(page) ? "A" : " ", - PageSlab(page) ? "S" : " ", - PageWriteback(page) ? "W" : " ", - PageCompound(page) ? "C" : " ", - PageSwapCache(page) ? "B" : " ", - PageMappedToDisk(page) ? "M" : " "); + migratetype_names[pageblock_mt], + page->flags, &page->flags); if (ret >= count) goto err; @@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; + if (page_ext->last_migrate_reason != -1) { + ret += snprintf(kbuf + ret, count - ret, + "Page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); + if (ret >= count) + goto err; + } + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; @@ -150,6 +183,31 @@ err: return -ENOMEM; } +void __dump_page_owner(struct page *page) +{ + struct page_ext *page_ext = lookup_page_ext(page); + struct stack_trace trace = { + .nr_entries = page_ext->nr_entries, + .entries = &page_ext->trace_entries[0], + }; + gfp_t gfp_mask = page_ext->gfp_mask; + int mt = gfpflags_to_migratetype(gfp_mask); + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { + pr_alert("page_owner info is not active (free page?)\n"); + return; + } + + pr_alert("page allocated via order %u, migratetype %s, " + "gfp_mask %#x(%pGg)\n", page_ext->order, + migratetype_names[mt], gfp_mask, &gfp_mask); + print_stack_trace(&trace, 0); + + if (page_ext->last_migrate_reason != -1) + pr_alert("page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); +} + static ssize_t read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -157,7 +215,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct page *page; struct page_ext *page_ext; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; page = NULL; @@ -305,7 +363,7 @@ static int __init pageowner_init(void) { struct dentry *dentry; - if (!page_owner_inited) { + if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } diff --git a/mm/debug-pagealloc.c b/mm/page_poison.c index 5bf5906ce13b..479e7ea2bea6 100644 --- a/mm/debug-pagealloc.c +++ b/mm/page_poison.c @@ -6,22 +6,48 @@ #include <linux/poison.h> #include <linux/ratelimit.h> -static bool page_poisoning_enabled __read_mostly; +static bool __page_poisoning_enabled __read_mostly; +static bool want_page_poisoning __read_mostly; -static bool need_page_poisoning(void) +static int early_page_poison_param(char *buf) { - if (!debug_pagealloc_enabled()) - return false; + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + want_page_poisoning = true; + else if (strcmp(buf, "off") == 0) + want_page_poisoning = false; - return true; + return 0; +} +early_param("page_poison", early_page_poison_param); + +bool page_poisoning_enabled(void) +{ + return __page_poisoning_enabled; +} + +static bool need_page_poisoning(void) +{ + return want_page_poisoning; } static void init_page_poisoning(void) { - if (!debug_pagealloc_enabled()) - return; + /* + * page poisoning is debug page alloc for some arches. 
If either + * of those options are enabled, enable poisoning + */ + if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { + if (!want_page_poisoning && !debug_pagealloc_enabled()) + return; + } else { + if (!want_page_poisoning) + return; + } - page_poisoning_enabled = true; + __page_poisoning_enabled = true; } struct page_ext_operations page_poisoning_ops = { @@ -45,11 +71,14 @@ static inline void clear_page_poison(struct page *page) __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } -static inline bool page_poison(struct page *page) +bool page_is_poisoned(struct page *page) { struct page_ext *page_ext; page_ext = lookup_page_ext(page); + if (!page_ext) + return false; + return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -83,6 +112,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) unsigned char *start; unsigned char *end; + if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)) + return; + start = memchr_inv(mem, PAGE_POISON, bytes); if (!start) return; @@ -95,9 +127,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) if (!__ratelimit(&ratelimit)) return; else if (start == end && single_bit_flip(*start, PAGE_POISON)) - printk(KERN_ERR "pagealloc: single bit error\n"); + pr_err("pagealloc: single bit error\n"); else - printk(KERN_ERR "pagealloc: memory corruption\n"); + pr_err("pagealloc: memory corruption\n"); print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, end - start + 1, 1); @@ -108,7 +140,7 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_poison(page)) + if (!page_is_poisoned(page)) return; addr = kmap_atomic(page); @@ -125,9 +157,9 @@ static void unpoison_pages(struct page *page, int n) unpoison_page(page + i); } -void __kernel_map_pages(struct page *page, int numpages, int enable) +void kernel_poison_pages(struct page *page, int numpages, int enable) { - if (!page_poisoning_enabled) + if (!page_poisoning_enabled()) return; if (enable) @@ -135,3 +167,10 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) else poison_pages(page, numpages); } + +#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + /* This function does nothing, all work is done via poison pages */ +} +#endif diff --git a/mm/rmap.c b/mm/rmap.c index 79f3bf047f38..02f0bfc3c80a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1287,21 +1287,17 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); } static void page_remove_file_rmap(struct page *page) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { @@ -1320,12 +1316,12 @@ static void page_remove_file_rmap(struct page *page) * pte lock(a spinlock) is held, which implies preemption disabled. 
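Aside: with page_poison.c in place, the page allocator calls kernel_poison_pages() symmetrically: poison when a page is freed, verify and clear the pattern when it is allocated again, as the free_pages_prepare()/prep_new_page() hunks earlier in this diff show. A condensed sketch of that pairing (the wrapper names are illustrative):

#include <linux/mm.h>

/* Illustrative wrappers around the hooks used by the page allocator. */
static void my_on_free(struct page *page, unsigned int order)
{
        kernel_poison_pages(page, 1 << order, 0);  /* fill with PAGE_POISON */
}

static void my_on_alloc(struct page *page, unsigned int order)
{
        /* checks for corruption (unless PAGE_POISONING_NO_SANITY) and
         * clears the pattern; with PAGE_POISONING_ZERO the pages come
         * back zeroed, so __GFP_ZERO can skip clear_highpage(), see
         * free_pages_prezeroed() above */
        kernel_poison_pages(page, 1 << order, 1);
}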
*/ __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); } static void page_remove_anon_compound_rmap(struct page *page) diff --git a/mm/shmem.c b/mm/shmem.c index 440e2a7e6c1c..1acfdbc4bd9e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1116,7 +1116,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_replace_page(oldpage, newpage); + mem_cgroup_migrate(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } diff --git a/mm/slab.c b/mm/slab.c index 621fbcb35a36..852fc5c79829 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t; #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) /* - * true if a page was allocated from pfmemalloc reserves for network-based - * swap - */ -static bool pfmemalloc_active __read_mostly; - -/* * struct array_cache * * Purpose: @@ -195,10 +189,6 @@ struct array_cache { * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing * the entries. - * - * Entries should not be directly dereferenced as - * entries belonging to slabs marked pfmemalloc will - * have the lower bits set SLAB_OBJ_PFMEMALLOC */ }; @@ -207,33 +197,6 @@ struct alien_cache { struct array_cache ac; }; -#define SLAB_OBJ_PFMEMALLOC 1 -static inline bool is_obj_pfmemalloc(void *objp) -{ - return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; -} - -static inline void set_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); - return; -} - -static inline void clear_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); -} - -/* - * bootstrap: The caches do not work without cpuarrays anymore, but the - * cpuarrays are allocated from the generic caches... - */ -#define BOOT_CPUCACHE_ENTRIES 1 -struct arraycache_init { - struct array_cache cache; - void *entries[BOOT_CPUCACHE_ENTRIES]; -}; - /* * Need this for bootstrapping a per node allocator. 
*/ @@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) +#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) #define CFLGS_OFF_SLAB (0x80000000UL) +#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) -#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) #define BATCHREFILL_LIMIT 16 /* @@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#define OBJECT_FREE (0) -#define OBJECT_ACTIVE (1) - #ifdef CONFIG_DEBUG_SLAB_LEAK -static void set_obj_status(struct page *page, int idx, int val) +static inline bool is_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; - status[idx] = val; + return atomic_read(&cachep->store_user_clean) == 1; } -static inline unsigned int get_obj_status(struct page *page, int idx) +static inline void set_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; + atomic_set(&cachep->store_user_clean, 1); +} - return status[idx]; +static inline void set_store_user_dirty(struct kmem_cache *cachep) +{ + if (is_store_user_clean(cachep)) + atomic_set(&cachep->store_user_clean, 0); } #else -static inline void set_obj_status(struct page *page, int idx, int val) {} +static inline void set_store_user_dirty(struct kmem_cache *cachep) {} #endif @@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, return reciprocal_divide(offset, cache->reciprocal_buffer_size); } +#define BOOT_CPUCACHE_ENTRIES 1 /* internal cache of cache description objs */ static struct kmem_cache kmem_cache_boot = { .batchcount = 1, @@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return this_cpu_ptr(cachep->cpu_cache); } -static size_t calculate_freelist_size(int nr_objs, size_t align) -{ - size_t freelist_size; - - freelist_size = nr_objs * sizeof(freelist_idx_t); - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size += nr_objs * sizeof(char); - - if (align) - freelist_size = ALIGN(freelist_size, align); - - return freelist_size; -} - -static int calculate_nr_objs(size_t slab_size, size_t buffer_size, - size_t idx_size, size_t align) -{ - int nr_objs; - size_t remained_size; - size_t freelist_size; - int extra_space = 0; - - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - extra_space = sizeof(char); - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = slab_size / (buffer_size + idx_size + extra_space); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - remained_size = slab_size - nr_objs * buffer_size; - freelist_size = calculate_freelist_size(nr_objs, align); - if (remained_size < freelist_size) - nr_objs--; - - return nr_objs; -} - /* * Calculate the number of objects and left-over bytes for a given buffer size. 
*/ -static void cache_estimate(unsigned long gfporder, size_t buffer_size, - size_t align, int flags, size_t *left_over, - unsigned int *num) +static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, + unsigned long flags, size_t *left_over) { - int nr_objs; - size_t mgmt_size; + unsigned int num; size_t slab_size = PAGE_SIZE << gfporder; /* @@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - One unsigned int for each object - * - Padding to respect alignment of @align * - @buffer_size bytes for each object + * - One freelist_idx_t for each object + * + * We don't need to consider alignment of freelist because + * freelist will be at the end of slab page. The objects will be + * at the correct alignment. * * If the slab management structure is off the slab, then the * alignment will already be calculated into the size. Because * the slabs are all pages aligned, the objects will be at the * correct alignment when allocated. */ - if (flags & CFLGS_OFF_SLAB) { - mgmt_size = 0; - nr_objs = slab_size / buffer_size; - + if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { + num = slab_size / buffer_size; + *left_over = slab_size % buffer_size; } else { - nr_objs = calculate_nr_objs(slab_size, buffer_size, - sizeof(freelist_idx_t), align); - mgmt_size = calculate_freelist_size(nr_objs, align); + num = slab_size / (buffer_size + sizeof(freelist_idx_t)); + *left_over = slab_size % + (buffer_size + sizeof(freelist_idx_t)); } - *num = nr_objs; - *left_over = slab_size - nr_objs*buffer_size - mgmt_size; + + return num; } #if DEBUG @@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries, return ac; } -static inline bool is_slab_pfmemalloc(struct page *page) -{ - return PageSlabPfmemalloc(page); -} - -/* Clears pfmemalloc_active if no slabs have pfmalloc set */ -static void recheck_pfmemalloc_active(struct kmem_cache *cachep, - struct array_cache *ac) -{ - struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); - struct page *page; - unsigned long flags; - - if (!pfmemalloc_active) - return; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_partial, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_free, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - pfmemalloc_active = false; -out: - spin_unlock_irqrestore(&n->list_lock, flags); -} - -static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, - gfp_t flags, bool force_refill) +static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, + struct page *page, void *objp) { - int i; - void *objp = ac->entry[--ac->avail]; - - /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ - if (unlikely(is_obj_pfmemalloc(objp))) { - struct kmem_cache_node *n; - - if (gfp_pfmemalloc_allowed(flags)) { - clear_obj_pfmemalloc(&objp); - return objp; - } - - /* The caller cannot use PFMEMALLOC objects, find another one */ - for (i = 0; i < ac->avail; i++) { - /* If a !PFMEMALLOC object is found, swap them */ - if (!is_obj_pfmemalloc(ac->entry[i])) { - objp = ac->entry[i]; - ac->entry[i] = ac->entry[ac->avail]; - ac->entry[ac->avail] = objp; - return objp; - } - } - - /* - * If there are empty slabs on the slabs_free list and we are - * being forced to refill the cache, 
mark this one !pfmemalloc. - */ - n = get_node(cachep, numa_mem_id()); - if (!list_empty(&n->slabs_free) && force_refill) { - struct page *page = virt_to_head_page(objp); - ClearPageSlabPfmemalloc(page); - clear_obj_pfmemalloc(&objp); - recheck_pfmemalloc_active(cachep, ac); - return objp; - } - - /* No !PFMEMALLOC objects available */ - ac->avail++; - objp = NULL; - } - - return objp; -} - -static inline void *ac_get_obj(struct kmem_cache *cachep, - struct array_cache *ac, gfp_t flags, bool force_refill) -{ - void *objp; - - if (unlikely(sk_memalloc_socks())) - objp = __ac_get_obj(cachep, ac, flags, force_refill); - else - objp = ac->entry[--ac->avail]; - - return objp; -} - -static noinline void *__ac_put_obj(struct kmem_cache *cachep, - struct array_cache *ac, void *objp) -{ - if (unlikely(pfmemalloc_active)) { - /* Some pfmemalloc slabs exist, check if this is one */ - struct page *page = virt_to_head_page(objp); - if (PageSlabPfmemalloc(page)) - set_obj_pfmemalloc(&objp); - } + struct kmem_cache_node *n; + int page_node; + LIST_HEAD(list); - return objp; -} + page_node = page_to_nid(page); + n = get_node(cachep, page_node); -static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, - void *objp) -{ - if (unlikely(sk_memalloc_socks())) - objp = __ac_put_obj(cachep, ac, objp); + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); + spin_unlock(&n->list_lock); - ac->entry[ac->avail++] = objp; + slabs_destroy(cachep, &list); } /* @@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, ac, page_node, &list); } - ac_put_obj(cachep, ac, objp); + ac->entry[ac->avail++] = objp; spin_unlock(&alien->lock); slabs_destroy(cachep, &list); } else { @@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) return; - printk(KERN_WARNING - "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nodeid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", + pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nodeid, gfpflags, &gfpflags); + pr_warn(" cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { @@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) num_slabs += active_slabs; num_objs = num_slabs * cachep->num; - printk(KERN_WARNING - " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", node, active_slabs, num_slabs, active_objs, num_objs, free_objects); } @@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (page_is_pfmemalloc(page)) - pfmemalloc_active = true; - nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) add_zone_page_state(page_zone(page), @@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); + __SetPageSlab(page); - if (page_is_pfmemalloc(page)) + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ + if (sk_memalloc_socks() && page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & 
SLAB_NOTRACK)) { @@ -1670,6 +1476,14 @@ static void kmem_rcu_free(struct rcu_head *head) } #if DEBUG +static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) +{ + if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && + (cachep->size % PAGE_SIZE) == 0) + return true; + + return false; +} #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, @@ -1703,6 +1517,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, } *addr++ = 0x87654321; } + +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) +{ + if (!is_debug_pagealloc_cache(cachep)) + return; + + if (caller) + store_stackinfo(cachep, objp, caller); + + kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); +} + +#else +static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) {} + #endif static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) @@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) int size, i; int lines = 0; + if (is_debug_pagealloc_cache(cachep)) + return; + realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; @@ -1842,20 +1676,18 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct page *page) { int i; + + if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { + poison_obj(cachep, page->freelist - obj_offset(cachep), + POISON_FREE); + } + for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (cachep->size % PAGE_SIZE == 0 && - OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) @@ -1916,7 +1748,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created * @size: size of objects to be created in this cache. - * @align: required alignment for the objects. * @flags: slab allocation flags * * Also calculates the number of objects per slab. @@ -1926,9 +1757,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * towards high-order requests, this should be changed. */ static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, size_t align, unsigned long flags) + size_t size, unsigned long flags) { - unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -1936,7 +1766,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, unsigned int num; size_t remainder; - cache_estimate(gfporder, size, align, flags, &remainder, &num); + num = cache_estimate(gfporder, size, flags, &remainder); if (!num) continue; @@ -1945,19 +1775,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { - size_t freelist_size_per_obj = sizeof(freelist_idx_t); + struct kmem_cache *freelist_cache; + size_t freelist_size; + + freelist_size = num * sizeof(freelist_idx_t); + freelist_cache = kmalloc_slab(freelist_size, 0u); + if (!freelist_cache) + continue; + /* - * Max number of objs-per-slab for caches which - * use off-slab slabs. 
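Aside: the rewritten cache_estimate() a little above is now plain integer arithmetic. As a concrete check, with 4096-byte slab pages, 256-byte objects and a 2-byte freelist_idx_t, an on-slab cache fits 4096 / (256 + 2) = 15 objects with 226 bytes left over, while an off-slab or objfreelist cache fits 4096 / 256 = 16 objects with nothing left over. The same computation as a standalone snippet (plain userspace C; the sizes are assumptions made for the example):

#include <stdio.h>
#include <stddef.h>

static unsigned int estimate(size_t slab_size, size_t obj_size,
                             size_t idx_size, size_t *left_over)
{
        size_t per_obj = obj_size + idx_size;   /* idx_size == 0: off-slab */
        unsigned int num = slab_size / per_obj;

        *left_over = slab_size - (size_t)num * per_obj;
        return num;
}

int main(void)
{
        size_t left;
        unsigned int num;

        num = estimate(4096, 256, 2, &left);
        printf("on-slab:  %u objs, %zu left\n", num, left);   /* 15, 226 */
        num = estimate(4096, 256, 0, &left);
        printf("off-slab: %u objs, %zu left\n", num, left);   /* 16, 0 */
        return 0;
}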
Needed to avoid a possible - * looping condition in cache_grow(). + * Needed to avoid possible looping condition + * in cache_grow() */ - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size_per_obj += sizeof(char); - offslab_limit = size; - offslab_limit /= freelist_size_per_obj; + if (OFF_SLAB(freelist_cache)) + continue; - if (num > offslab_limit) - break; + /* check if off slab has enough benefit */ + if (freelist_cache->size > cachep->size / 2) + continue; } /* Found something acceptable - save it away */ @@ -2075,6 +1910,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return cachep; } +static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + return false; + + left = calculate_slab_order(cachep, size, + flags | CFLGS_OBJFREELIST_SLAB); + if (!cachep->num) + return false; + + if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_off_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + /* + * Always use on-slab management when SLAB_NOLEAKTRACE + * to avoid recursive calls into kmemleak. + */ + if (flags & SLAB_NOLEAKTRACE) + return false; + + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB); + if (!cachep->num) + return false; + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. + */ + if (left >= cachep->num * sizeof(freelist_idx_t)) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_on_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + left = calculate_slab_order(cachep, size, flags); + if (!cachep->num) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + /** * __kmem_cache_create - Create a cache. * @cachep: cache management descriptor @@ -2099,7 +2007,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, freelist_size; size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; @@ -2119,8 +2026,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; #endif - if (flags & SLAB_DESTROY_BY_RCU) - BUG_ON(flags & SLAB_POISON); #endif /* @@ -2152,6 +2057,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * 4) Store it. */ cachep->align = ralign; + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. */ + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; if (slab_is_available()) gfp = GFP_KERNEL; @@ -2179,37 +2088,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else size += BYTES_PER_WORD; } -#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - /* - * To activate debug pagealloc, off-slab management is necessary - * requirement. In early phase of initialization, small sized slab - * doesn't get initialized so it would not be possible. So, we need - * to check size >= 256. 
It guarantees that all necessary small - * sized slab is initialized in current slab initialization sequence. - */ - if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && - size >= 256 && cachep->object_size > cache_line_size() && - ALIGN(size, cachep->align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); - size = PAGE_SIZE; - } -#endif #endif - /* - * Determine if the slab management is 'on' or 'off' slab. - * (bootstrapping cannot cope with offslab caches so don't do - * it too early on. Always use on-slab management when - * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) - */ - if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && - !(flags & SLAB_NOLEAKTRACE)) - /* - * Size is large, assume best to place the slab management obj - * off-slab (should allow better packing of objs). - */ - flags |= CFLGS_OFF_SLAB; - size = ALIGN(size, cachep->align); /* * We should restrict the number of objects in a slab to implement @@ -2218,42 +2098,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); - left_over = calculate_slab_order(cachep, size, cachep->align, flags); - - if (!cachep->num) - return -E2BIG; - - freelist_size = calculate_freelist_size(cachep->num, cachep->align); - +#if DEBUG /* - * If the slab has been placed off-slab, and we have enough space then - * move it on-slab. This is at the expense of any extra colouring. + * To activate debug pagealloc, off-slab management is necessary + * requirement. In early phase of initialization, small sized slab + * doesn't get initialized so it would not be possible. So, we need + * to check size >= 256. It guarantees that all necessary small + * sized slab is initialized in current slab initialization sequence. */ - if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { - flags &= ~CFLGS_OFF_SLAB; - left_over -= freelist_size; + if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && + size >= 256 && cachep->object_size > cache_line_size()) { + if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { + size_t tmp_size = ALIGN(size, PAGE_SIZE); + + if (set_off_slab_cache(cachep, tmp_size, flags)) { + flags |= CFLGS_OFF_SLAB; + cachep->obj_offset += tmp_size - size; + size = tmp_size; + goto done; + } + } } +#endif - if (flags & CFLGS_OFF_SLAB) { - /* really off slab. No need for manual alignment */ - freelist_size = calculate_freelist_size(cachep->num, 0); + if (set_objfreelist_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OBJFREELIST_SLAB; + goto done; + } -#ifdef CONFIG_PAGE_POISONING - /* If we're going to use the generic kernel_map_pages() - * poisoning, then it's going to smash the contents of - * the redzone and userword anyhow, so switch them off. - */ - if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); -#endif + if (set_off_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OFF_SLAB; + goto done; } - cachep->colour_off = cache_line_size(); - /* Offset must be a multiple of the alignment. 
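Aside: after this restructuring, __kmem_cache_create() tries the three freelist placements in order of decreasing memory efficiency and keeps the first one that fits at least one object. Condensed into a sketch that leans on the static set_*_cache() helpers introduced above (an outline only; the real control flow also handles the debug-pagealloc special case first):

/* Outline of the new placement decision; relies on the slab.c helpers
 * defined in the hunks above, so it is not standalone. */
static int my_choose_freelist(struct kmem_cache *cachep, size_t size,
                              unsigned long *flags)
{
        if (set_objfreelist_slab_cache(cachep, size, *flags)) {
                *flags |= CFLGS_OBJFREELIST_SLAB; /* freelist inside a free object */
                return 0;
        }
        if (set_off_slab_cache(cachep, size, *flags)) {
                *flags |= CFLGS_OFF_SLAB;         /* freelist in a kmalloc buffer */
                return 0;
        }
        if (set_on_slab_cache(cachep, size, *flags))
                return 0;                         /* freelist at end of the slab page */
        return -E2BIG;                            /* not even one object fits */
}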
*/ - if (cachep->colour_off < cachep->align) - cachep->colour_off = cachep->align; - cachep->colour = left_over / cachep->colour_off; - cachep->freelist_size = freelist_size; + if (set_on_slab_cache(cachep, size, flags)) + goto done; + + return -E2BIG; + +done: + cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); cachep->flags = flags; cachep->allocflags = __GFP_COMP; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) @@ -2261,16 +2145,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); - if (flags & CFLGS_OFF_SLAB) { - cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); - /* - * This is a possibility for one of the kmalloc_{dma,}_caches. - * But since we go off slab only for object size greater than - * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created - * in ascending order,this should not happen at all. - * But leave a BUG_ON for some lucky dude. - */ - BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); +#if DEBUG + /* + * If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. + */ + if (IS_ENABLED(CONFIG_PAGE_POISONING) && + (cachep->flags & SLAB_POISON) && + is_debug_pagealloc_cache(cachep)) + cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif + + if (OFF_SLAB(cachep)) { + cachep->freelist_cache = + kmalloc_slab(cachep->freelist_size, 0u); } err = setup_cpu_cache(cachep, gfp); @@ -2377,9 +2266,6 @@ static int drain_freelist(struct kmem_cache *cache, } page = list_entry(p, struct page, lru); -#if DEBUG - BUG_ON(page->active); -#endif list_del(&page->lru); /* * Safe to drop the lock. The slab is no longer linked @@ -2454,18 +2340,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, void *freelist; void *addr = page_address(page); - if (OFF_SLAB(cachep)) { + page->s_mem = addr + colour_off; + page->active = 0; + + if (OBJFREELIST_SLAB(cachep)) + freelist = NULL; + else if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); if (!freelist) return NULL; } else { - freelist = addr + colour_off; - colour_off += cachep->freelist_size; + /* We will use last bytes at the slab for freelist */ + freelist = addr + (PAGE_SIZE << cachep->gfporder) - + cachep->freelist_size; } - page->active = 0; - page->s_mem = addr + colour_off; + return freelist; } @@ -2480,17 +2371,14 @@ static inline void set_free_obj(struct page *page, ((freelist_idx_t *)(page->freelist))[idx] = val; } -static void cache_init_objs(struct kmem_cache *cachep, - struct page *page) +static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) { +#if DEBUG int i; for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); -#if DEBUG - /* need to poison the objs? */ - if (cachep->flags & SLAB_POISON) - poison_obj(cachep, objp, POISON_FREE); + if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = NULL; @@ -2514,15 +2402,32 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->size % PAGE_SIZE) == 0 && - OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); -#else - if (cachep->ctor) - cachep->ctor(objp); + /* need to poison the objs? 
*/ + if (cachep->flags & SLAB_POISON) { + poison_obj(cachep, objp, POISON_FREE); + slab_kernel_map(cachep, objp, 0, 0); + } + } #endif - set_obj_status(page, i, OBJECT_FREE); +} + +static void cache_init_objs(struct kmem_cache *cachep, + struct page *page) +{ + int i; + + cache_init_objs_debug(cachep, page); + + if (OBJFREELIST_SLAB(cachep)) { + page->freelist = index_to_obj(cachep, page, cachep->num - 1) + + obj_offset(cachep); + } + + for (i = 0; i < cachep->num; i++) { + /* constructor could break poison info */ + if (DEBUG == 0 && cachep->ctor) + cachep->ctor(index_to_obj(cachep, page, i)); + set_free_obj(page, i, i); } } @@ -2537,30 +2442,28 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) } } -static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, - int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) { void *objp; objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; + #if DEBUG - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); + if (cachep->flags & SLAB_STORE_USER) + set_store_user_dirty(cachep); #endif return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct page *page, - void *objp, int nodeid) +static void slab_put_obj(struct kmem_cache *cachep, + struct page *page, void *objp) { unsigned int objnr = obj_to_index(cachep, page, objp); #if DEBUG unsigned int i; - /* Verify that the slab belongs to the intended node */ - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); - /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { @@ -2571,6 +2474,9 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, } #endif page->active--; + if (!page->freelist) + page->freelist = objp + obj_offset(cachep); + set_free_obj(page, page->active, objnr); } @@ -2645,7 +2551,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Get slab management. 
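Aside: whatever its placement, the freelist itself is just an array of object indices; entries [page->active, cachep->num) name the currently free objects. The pop/push pair used by slab_get_obj()/slab_put_obj() above reduces to roughly this (get_free_obj(), set_free_obj(), index_to_obj() and obj_to_index() are the helpers already in this file):

/* Rough equivalent of the per-slab allocation/free fast path. */
static void *my_slab_pop(struct kmem_cache *cachep, struct page *page)
{
        void *objp = index_to_obj(cachep, page,
                                  get_free_obj(page, page->active));

        page->active++;
        return objp;
}

static void my_slab_push(struct kmem_cache *cachep, struct page *page,
                         void *objp)
{
        page->active--;
        set_free_obj(page, page->active, obj_to_index(cachep, page, objp));
}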
*/ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!freelist) + if (OFF_SLAB(cachep) && !freelist) goto opps1; slab_map_pages(cachep, page, freelist); @@ -2726,27 +2632,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; } - if (cachep->flags & SLAB_STORE_USER) + if (cachep->flags & SLAB_STORE_USER) { + set_store_user_dirty(cachep); *dbg_userword(cachep, objp) = (void *)caller; + } objnr = obj_to_index(cachep, page, objp); BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); - set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); - } else { - poison_obj(cachep, objp, POISON_FREE); - } -#else poison_obj(cachep, objp, POISON_FREE); -#endif + slab_kernel_map(cachep, objp, 0, caller); } return objp; } @@ -2756,7 +2654,85 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #define cache_free_debugcheck(x,objp,z) (objp) #endif -static struct page *get_first_slab(struct kmem_cache_node *n) +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list) +{ +#if DEBUG + void *next = *list; + void *objp; + + while (next) { + objp = next - obj_offset(cachep); + next = *(void **)next; + poison_obj(cachep, objp, POISON_FREE); + } +#endif +} + +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct page *page, + void **list) +{ + /* move slabp to correct slabp list: */ + list_del(&page->lru); + if (page->active == cachep->num) { + list_add(&page->lru, &n->slabs_full); + if (OBJFREELIST_SLAB(cachep)) { +#if DEBUG + /* Poisoning will be done without holding the lock */ + if (cachep->flags & SLAB_POISON) { + void **objp = page->freelist; + + *objp = *list; + *list = objp; + } +#endif + page->freelist = NULL; + } + } else + list_add(&page->lru, &n->slabs_partial); +} + +/* Try to find non-pfmemalloc slab if needed */ +static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, + struct page *page, bool pfmemalloc) +{ + if (!page) + return NULL; + + if (pfmemalloc) + return page; + + if (!PageSlabPfmemalloc(page)) + return page; + + /* No need to keep pfmemalloc slab if we have enough free objects */ + if (n->free_objects > n->free_limit) { + ClearPageSlabPfmemalloc(page); + return page; + } + + /* Move pfmemalloc slab to the end of list to speed up next search */ + list_del(&page->lru); + if (!page->active) + list_add_tail(&page->lru, &n->slabs_free); + else + list_add_tail(&page->lru, &n->slabs_partial); + + list_for_each_entry(page, &n->slabs_partial, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + list_for_each_entry(page, &n->slabs_free, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + return NULL; +} + +static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; @@ -2768,21 +2744,51 @@ static struct page *get_first_slab(struct kmem_cache_node *n) struct page, lru); } + if (sk_memalloc_socks()) + return get_valid_first_slab(n, page, pfmemalloc); + return page; } -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, - bool force_refill) +static noinline void *cache_alloc_pfmemalloc(struct kmem_cache 
*cachep, + struct kmem_cache_node *n, gfp_t flags) +{ + struct page *page; + void *obj; + void *list = NULL; + + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + + spin_lock(&n->list_lock); + page = get_first_slab(n, true); + if (!page) { + spin_unlock(&n->list_lock); + return NULL; + } + + obj = slab_get_obj(cachep, page); + n->free_objects--; + + fixup_slab_list(cachep, n, page, &list); + + spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + return obj; +} + +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_cache_node *n; struct array_cache *ac; int node; + void *list = NULL; check_irq_off(); node = numa_mem_id(); - if (unlikely(force_refill)) - goto force_grow; + retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; @@ -2808,7 +2814,7 @@ retry: while (batchcount > 0) { struct page *page; /* Get slab alloc is to come from. */ - page = get_first_slab(n); + page = get_first_slab(n, false); if (!page) goto must_grow; @@ -2826,26 +2832,29 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, page, - node)); + ac->entry[ac->avail++] = slab_get_obj(cachep, page); } - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page, &list); } must_grow: n->free_objects -= ac->avail; alloc_done: spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); if (unlikely(!ac->avail)) { int x; -force_grow: + + /* Check if we can use obj in pfmemalloc slab */ + if (sk_memalloc_socks()) { + void *obj = cache_alloc_pfmemalloc(cachep, n, flags); + + if (obj) + return obj; + } + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ @@ -2853,7 +2862,7 @@ force_grow: node = numa_mem_id(); /* no objects in sight? abort */ - if (!x && (ac->avail == 0 || force_refill)) + if (!x && ac->avail == 0) return NULL; if (!ac->avail) /* objects refilled by interrupt? 
*/ @@ -2861,7 +2870,7 @@ force_grow: } ac->touched = 1; - return ac_get_obj(cachep, ac, flags, force_refill); + return ac->entry[--ac->avail]; } static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, @@ -2877,20 +2886,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { - struct page *page; - if (!objp) return objp; if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) @@ -2910,8 +2910,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone2(cachep, objp) = RED_ACTIVE; } - page = virt_to_head_page(objp); - set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -2926,40 +2924,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) #endif -static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) -{ - if (unlikely(cachep == kmem_cache)) - return false; - - return should_failslab(cachep->object_size, flags, cachep->flags); -} - static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; - bool force_refill = false; check_irq_off(); ac = cpu_cache_get(cachep); if (likely(ac->avail)) { ac->touched = 1; - objp = ac_get_obj(cachep, ac, flags, false); + objp = ac->entry[--ac->avail]; - /* - * Allow for the possibility all avail objects are not allowed - * by the current flags - */ - if (objp) { - STATS_INC_ALLOCHIT(cachep); - goto out; - } - force_refill = true; + STATS_INC_ALLOCHIT(cachep); + goto out; } STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags, force_refill); + objp = cache_alloc_refill(cachep, flags); /* * the 'ac' may be updated by cache_alloc_refill(), * and kmemleak_erase() requires its correct value. 
@@ -3097,6 +3079,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, struct page *page; struct kmem_cache_node *n; void *obj; + void *list = NULL; int x; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); @@ -3106,7 +3089,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, retry: check_irq_off(); spin_lock(&n->list_lock); - page = get_first_slab(n); + page = get_first_slab(n, false); if (!page) goto must_grow; @@ -3118,17 +3101,13 @@ retry: BUG_ON(page->active == cachep->num); - obj = slab_get_obj(cachep, page, nodeid); + obj = slab_get_obj(cachep, page); n->free_objects--; - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page, &list); spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); goto done; must_grow: @@ -3152,14 +3131,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, int slab_node = numa_mem_id(); flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (slab_should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3188,16 +3163,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, - flags); - if (likely(ptr)) { - kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(ptr, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && ptr) + memset(ptr, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &ptr); return ptr; } @@ -3240,30 +3210,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) void *objp; flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (slab_should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); - kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, - flags); prefetchw(objp); - if (likely(objp)) { - kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(objp, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && objp) + memset(objp, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &objp); return objp; } @@ -3281,13 +3242,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, void *objp; struct page *page; - clear_obj_pfmemalloc(&objpp[i]); objp = objpp[i]; page = virt_to_head_page(objp); list_del(&page->lru); check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, page, objp, node); + slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); n->free_objects++; @@ -3317,9 +3277,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) LIST_HEAD(list); batchcount = ac->batchcount; 
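The slab_alloc() and slab_alloc_node() hunks above replace the open-coded failslab, memcg, kmemleak and kmemcheck handling with the shared slab_pre_alloc_hook()/slab_post_alloc_hook() pair (added to mm/slab.h further down), which is also what lets the new SLAB bulk allocator reuse the same bookkeeping. The following is only an illustrative user-space sketch of that shape, not kernel code; every name in it (toy_cache, toy_pre_alloc_hook, toy_post_alloc_hook, toy_alloc_bulk) is invented:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Invented names throughout; this mirrors only the *shape* of the
 * pre/post allocation-hook refactor, not the kernel implementation. */
struct toy_cache {
    size_t object_size;
    unsigned long allocated;    /* stand-in for the hooks' bookkeeping */
};

#define TOY_GFP_ZERO 0x1u

/* May veto the allocation (fault injection) or return a different cache
 * (per-memcg caches) -- NULL aborts the allocation. */
static struct toy_cache *toy_pre_alloc_hook(struct toy_cache *c, unsigned gfp)
{
    (void)gfp;
    return c;
}

/* Per-object bookkeeping; taking (size, array) lets the single-object and
 * bulk paths share one hook. */
static void toy_post_alloc_hook(struct toy_cache *c, unsigned gfp,
                                size_t size, void **p)
{
    size_t i;

    (void)gfp;
    for (i = 0; i < size; i++)
        if (p[i])
            c->allocated++;
}

static void *toy_do_alloc(struct toy_cache *c)
{
    return malloc(c->object_size);    /* stands in for the real fast path */
}

static void *toy_alloc(struct toy_cache *c, unsigned gfp)
{
    void *obj;

    c = toy_pre_alloc_hook(c, gfp);
    if (!c)
        return NULL;

    obj = toy_do_alloc(c);
    if (obj && (gfp & TOY_GFP_ZERO))
        memset(obj, 0, c->object_size);

    toy_post_alloc_hook(c, gfp, 1, &obj);
    return obj;
}

/* Bulk variant: same hooks, objects gathered first, bookkeeping done once. */
static size_t toy_alloc_bulk(struct toy_cache *c, unsigned gfp,
                             size_t size, void **p)
{
    size_t i;

    c = toy_pre_alloc_hook(c, gfp);
    if (!c)
        return 0;

    for (i = 0; i < size; i++) {
        p[i] = toy_do_alloc(c);
        if (!p[i]) {
            while (i--)
                free(p[i]);
            return 0;
        }
    }
    toy_post_alloc_hook(c, gfp, size, p);
    return size;
}

int main(void)
{
    struct toy_cache c = { .object_size = 64, .allocated = 0 };
    void *one = toy_alloc(&c, TOY_GFP_ZERO);
    void *many[4];
    size_t i, n = toy_alloc_bulk(&c, 0, 4, many);

    printf("single=%p bulk=%zu bookkeeping saw %lu objects\n",
           one, n, c.allocated);
    free(one);
    for (i = 0; i < n; i++)
        free(many[i]);
    return 0;
}

Passing a (size, object array) pair to the post hook is the design point: the single-object path becomes the size == 1 case of the bulk path.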
-#if DEBUG - BUG_ON(!batchcount || batchcount > ac->avail); -#endif + check_irq_off(); n = get_node(cachep, node); spin_lock(&n->list_lock); @@ -3389,7 +3347,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, cache_flusharray(cachep, ac); } - ac_put_obj(cachep, ac, objp); + if (sk_memalloc_socks()) { + struct page *page = virt_to_head_page(objp); + + if (unlikely(PageSlabPfmemalloc(page))) { + cache_free_pfmemalloc(cachep, page, objp); + return; + } + } + + ac->entry[ac->avail++] = objp; } /** @@ -3411,16 +3378,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); -void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +static __always_inline void +cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, unsigned long caller) { - __kmem_cache_free_bulk(s, size, p); + size_t i; + + for (i = 0; i < size; i++) + p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); } -EXPORT_SYMBOL(kmem_cache_free_bulk); int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) + void **p) { - return __kmem_cache_alloc_bulk(s, flags, size, p); + size_t i; + + s = slab_pre_alloc_hook(s, flags); + if (!s) + return 0; + + cache_alloc_debugcheck_before(s, flags); + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = __do_cache_alloc(s, flags); + + if (unlikely(!objp)) + goto error; + p[i] = objp; + } + local_irq_enable(); + + cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); + + /* Clear memory outside IRQ disabled section */ + if (unlikely(flags & __GFP_ZERO)) + for (i = 0; i < size; i++) + memset(p[i], 0, s->object_size); + + slab_post_alloc_hook(s, flags, size, p); + /* FIXME: Trace call missing. Christoph would like a bulk variant */ + return size; +error: + local_irq_enable(); + cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); + slab_post_alloc_hook(s, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); @@ -3567,6 +3571,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +{ + struct kmem_cache *s; + size_t i; + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = p[i]; + + if (!orig_s) /* called via kfree_bulk */ + s = virt_to_cache(objp); + else + s = cache_from_obj(orig_s, objp); + + debug_check_no_locks_freed(objp, s->object_size); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, s->object_size); + + __cache_free(s, objp, _RET_IP_); + } + local_irq_enable(); + + /* FIXME: add tracing */ +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + /** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. @@ -4102,15 +4132,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i; + int i, j; + unsigned long v; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - if (get_obj_status(page, i) != OBJECT_ACTIVE) + bool active = true; + + for (j = page->active; j < c->num; j++) { + if (get_free_obj(page, j) == i) { + active = false; + break; + } + } + + if (!active) + continue; + + /* + * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table + * mapping is established when actual object allocation and + * we could mistakenly access the unmapped object in the cpu + * cache. 
+ */ + if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v))) continue; - if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + if (!add_caller(n, v)) return; } } @@ -4146,21 +4195,31 @@ static int leaks_show(struct seq_file *m, void *p) if (!(cachep->flags & SLAB_RED_ZONE)) return 0; - /* OK, we can do it */ + /* + * Set store_user_clean and start to grab stored user information + * for all objects on this cache. If some alloc/free requests comes + * during the processing, information would be wrong so restart + * whole processing. + */ + do { + set_store_user_clean(cachep); + drain_cpu_caches(cachep); - x[1] = 0; + x[1] = 0; - for_each_kmem_cache_node(cachep, node, n) { + for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); - spin_lock_irq(&n->list_lock); + check_irq_on(); + spin_lock_irq(&n->list_lock); + + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); + spin_unlock_irq(&n->list_lock); + } + } while (!is_store_user_clean(cachep)); - list_for_each_entry(page, &n->slabs_full, lru) - handle_slab(x, cachep, page); - list_for_each_entry(page, &n->slabs_partial, lru) - handle_slab(x, cachep, page); - spin_unlock_irq(&n->list_lock); - } name = cachep->name; if (x[0] == x[1]) { /* Increase the buffer size */ diff --git a/mm/slab.h b/mm/slab.h index 2eedacea439d..b7934361f026 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -38,6 +38,10 @@ struct kmem_cache { #endif #include <linux/memcontrol.h> +#include <linux/fault-inject.h> +#include <linux/kmemcheck.h> +#include <linux/kasan.h> +#include <linux/kmemleak.h> /* * State of the slab allocator. @@ -121,7 +125,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) #elif defined(CONFIG_SLUB_DEBUG) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) #else #define SLAB_DEBUG_FLAGS (0) #endif @@ -168,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, /* * Generic implementation of bulk operations * These are useful for situations in which the allocator cannot - * perform optimizations. In that case segments of the objecct listed + * perform optimizations. In that case segments of the object listed * may be allocated or freed using these operations. */ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); @@ -307,7 +311,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) * to not do even the assignment. In that case, slab_equal_or_root * will also be a constant. */ - if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + if (!memcg_kmem_enabled() && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) return s; page = virt_to_head_page(x); @@ -321,6 +326,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. 
+ */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); + + if (should_failslab(s, flags)) + return NULL; + + return memcg_kmem_get_cache(s, flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) +{ + size_t i; + + flags &= gfp_allowed_mask; + for (i = 0; i < size; i++) { + void *object = p[i]; + + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, + s->flags, flags); + kasan_slab_alloc(s, object); + } + memcg_kmem_put_cache(s); +} + #ifndef CONFIG_SLOB /* * The slab lists for all objects. diff --git a/mm/slab_common.c b/mm/slab_common.c index 065b7bdabdc3..6afb2263a5c5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) { size_t i; - for (i = 0; i < nr; i++) - kmem_cache_free(s, p[i]); + for (i = 0; i < nr; i++) { + if (s) + kmem_cache_free(s, p[i]); + else + kfree(p[i]); + } } int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, diff --git a/mm/slub.c b/mm/slub.c index d8fbd4a6ed59..6c91324f9370 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -160,10 +168,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) */ #define MAX_PARTIAL 10 -#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ +#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) /* + * These debug flags cannot use CMPXCHG because there might be consistency + * issues when checking or reading debug information + */ +#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \ + SLAB_TRACE) + + +/* * Debugging flags that require metadata to be stored in the slab. These get * disabled when slub_debug=O is used and a cache's min order increases with * metadata. 
@@ -224,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -271,12 +269,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -284,30 +284,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline size_t slab_ksize(const struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->object_size; - -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. 
- */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; -} - static inline int order_objects(int order, unsigned long size, int reserved) { return ((PAGE_SIZE << order) - reserved) / size; @@ -458,6 +434,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -491,6 +483,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -630,7 +642,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -647,9 +661,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -679,6 +693,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -771,11 +788,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -820,6 +837,10 @@ static int check_object(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; } else { @@ -1031,20 +1052,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } -static noinline int alloc_debug_processing(struct kmem_cache *s, +static inline int 
alloc_consistency_checks(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) - goto bad; + return 0; if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); - goto bad; + return 0; } if (!check_object(s, page, object, SLUB_RED_INACTIVE)) - goto bad; + return 0; + + return 1; +} + +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, + void *object, unsigned long addr) +{ + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!alloc_consistency_checks(s, page, object, addr)) + goto bad; + } /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) @@ -1067,37 +1100,21 @@ bad: return 0; } -/* Supports checking bulk free of a constructed freelist */ -static noinline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, - void *head, void *tail, int bulk_cnt, - unsigned long addr, unsigned long *flags) +static inline int free_consistency_checks(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - void *object = head; - int cnt = 0; - - spin_lock_irqsave(&n->list_lock, *flags); - slab_lock(page); - - if (!check_slab(s, page)) - goto fail; - -next_object: - cnt++; - if (!check_valid_pointer(s, page, object)) { slab_err(s, page, "Invalid object pointer 0x%p", object); - goto fail; + return 0; } if (on_freelist(s, page, object)) { object_err(s, page, object, "Object already free"); - goto fail; + return 0; } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - goto out; + return 0; if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { @@ -1110,7 +1127,37 @@ next_object: } else object_err(s, page, object, "page slab pointer corrupt."); - goto fail; + return 0; + } + return 1; +} + +/* Supports checking bulk free of a constructed freelist */ +static noinline int free_debug_processing( + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + void *object = head; + int cnt = 0; + unsigned long uninitialized_var(flags); + int ret = 0; + + spin_lock_irqsave(&n->list_lock, flags); + slab_lock(page); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, page)) + goto out; + } + +next_object: + cnt++; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, page, object, addr)) + goto out; } if (s->flags & SLAB_STORE_USER) @@ -1124,23 +1171,18 @@ next_object: object = get_freepointer(s, object); goto next_object; } + ret = 1; + out: if (cnt != bulk_cnt) slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", bulk_cnt, cnt); slab_unlock(page); - /* - * Keep node_lock to preserve integrity - * until the object is actually freed - */ - return n; - -fail: - slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, *flags); - slab_fix(s, "Object at 0x%p not freed", object); - return NULL; + spin_unlock_irqrestore(&n->list_lock, flags); + if (!ret) + slab_fix(s, "Object at 0x%p not freed", object); + return ret; } static int __init setup_slub_debug(char *str) @@ -1172,7 +1214,7 @@ static int __init setup_slub_debug(char *str) for (; *str && *str != ','; str++) { switch (tolower(*str)) { case 'f': - slub_debug |= SLAB_DEBUG_FREE; + slub_debug |= SLAB_CONSISTENCY_CHECKS; break; case 'z': slub_debug |= SLAB_RED_ZONE; @@ -1231,10 +1273,10 @@ static inline void 
setup_object_debug(struct kmem_cache *s, static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } -static inline struct kmem_cache_node *free_debug_processing( +static inline int free_debug_processing( struct kmem_cache *s, struct page *page, void *head, void *tail, int bulk_cnt, - unsigned long addr, unsigned long *flags) { return NULL; } + unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -1281,36 +1323,6 @@ static inline void kfree_hook(const void *x) kasan_kfree_large(x); } -static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(gfpflags_allow_blocking(flags)); - - if (should_failslab(s->object_size, flags, s->flags)) - return NULL; - - return memcg_kmem_get_cache(s, flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - size_t size, void **p) -{ - size_t i; - - flags &= gfp_allowed_mask; - for (i = 0; i < size; i++) { - void *object = p[i]; - - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, - s->flags, flags); - kasan_slab_alloc(s, object); - } - memcg_kmem_put_cache(s); -} - static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); @@ -1470,7 +1482,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -1506,7 +1518,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (kmem_cache_debug(s)) { + if (s->flags & SLAB_CONSISTENCY_CHECKS) { void *p; slab_pad_check(s, page); @@ -2224,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; - pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nid, gfpflags); + pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nid, gfpflags, &gfpflags); pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -2642,8 +2654,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && - !(n = free_debug_processing(s, page, head, tail, cnt, - addr, &flags))) + !free_debug_processing(s, page, head, tail, cnt, addr)) return; do { @@ -2815,6 +2826,7 @@ struct detached_freelist { void *tail; void *freelist; int cnt; + struct kmem_cache *s; }; /* @@ -2829,26 +2841,45 @@ struct detached_freelist { * synchronization primitive. Look ahead in the array is limited due * to performance reasons. */ -static int build_detached_freelist(struct kmem_cache *s, size_t size, - void **p, struct detached_freelist *df) +static inline +int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) { size_t first_skipped_index = 0; int lookahead = 3; void *object; + struct page *page; /* Always re-init detached_freelist */ df->page = NULL; do { object = p[--size]; + /* Do we need !ZERO_OR_NULL_PTR(object) here? 
(for kfree) */ } while (!object && size); if (!object) return 0; + page = virt_to_head_page(object); + if (!s) { + /* Handle kalloc'ed objects */ + if (unlikely(!PageSlab(page))) { + BUG_ON(!PageCompound(page)); + kfree_hook(object); + __free_kmem_pages(page, compound_order(page)); + p[size] = NULL; /* mark object processed */ + return size; + } + /* Derive kmem_cache from object */ + df->s = page->slab_cache; + } else { + df->s = cache_from_obj(s, object); /* Support for memcg */ + } + /* Start new detached freelist */ - set_freepointer(s, object, NULL); - df->page = virt_to_head_page(object); + df->page = page; + set_freepointer(df->s, object, NULL); df->tail = object; df->freelist = object; p[size] = NULL; /* mark object processed */ @@ -2862,7 +2893,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, /* df->page is always set at this point */ if (df->page == virt_to_head_page(object)) { /* Opportunity build freelist */ - set_freepointer(s, object, df->freelist); + set_freepointer(df->s, object, df->freelist); df->freelist = object; df->cnt++; p[size] = NULL; /* mark object processed */ @@ -2881,25 +2912,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, return first_skipped_index; } - /* Note that interrupts must be enabled when calling this function. */ -void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { if (WARN_ON(!size)) return; do { struct detached_freelist df; - struct kmem_cache *s; - - /* Support for memcg */ - s = cache_from_obj(orig_s, p[size - 1]); size = build_detached_freelist(s, size, p, &df); if (unlikely(!df.page)) continue; - slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); + slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); @@ -3285,7 +3311,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ size += 2 * sizeof(struct track); - if (flags & SLAB_RED_ZONE) + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -3294,6 +3320,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. 
*/ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* @@ -3357,7 +3388,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; #endif @@ -4812,16 +4843,16 @@ SLAB_ATTR_RO(total_objects); static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); } static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { - s->flags &= ~SLAB_DEBUG_FREE; + s->flags &= ~SLAB_CONSISTENCY_CHECKS; if (buf[0] == '1') { s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_DEBUG_FREE; + s->flags |= SLAB_CONSISTENCY_CHECKS; } return length; } @@ -4865,7 +4896,6 @@ static ssize_t red_zone_store(struct kmem_cache *s, s->flags &= ~SLAB_RED_ZONE; if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; } calculate_sizes(s, -1); @@ -4886,7 +4916,6 @@ static ssize_t poison_store(struct kmem_cache *s, s->flags &= ~SLAB_POISON; if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; } calculate_sizes(s, -1); @@ -5356,7 +5385,7 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'd'; if (s->flags & SLAB_RECLAIM_ACCOUNT) *p++ = 'a'; - if (s->flags & SLAB_DEBUG_FREE) + if (s->flags & SLAB_CONSISTENCY_CHECKS) *p++ = 'F'; if (!(s->flags & SLAB_NOTRACK)) *p++ = 't'; diff --git a/mm/truncate.c b/mm/truncate.c index e3ee0e27cd17..7598b552ae03 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages); static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg; unsigned long flags; if (page->mapping != mapping) @@ -528,15 +527,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - memcg = mem_cgroup_begin_page_stat(page); spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -545,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 71b1c29948db..dd984470248f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -195,25 +195,25 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_ISOLATED_FILE); + nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON) 
+ - zone_page_state(zone, NR_ISOLATED_ANON); + nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_ISOLATED_ANON); return nr; } bool zone_reclaimable(struct zone *zone) { - return zone_page_state(zone, NR_PAGES_SCANNED) < + return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < zone_reclaimable_pages(zone) * 6; } -static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) return mem_cgroup_get_lru_size(lruvec, lru); @@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker) { size_t size = sizeof(*shrinker->nr_deferred); - /* - * If we only have one possible node in the system anyway, save - * ourselves the trouble and disable NUMA aware behavior. This way we - * will save memory and some small loop time later. - */ - if (nr_node_ids == 1) - shrinker->flags &= ~SHRINKER_NUMA_AWARE; - if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; @@ -611,12 +603,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { unsigned long flags; - struct mem_cgroup *memcg; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - memcg = mem_cgroup_begin_page_stat(page); spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. @@ -656,7 +646,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -682,9 +671,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); - __delete_from_page_cache(page, shadow, memcg); + __delete_from_page_cache(page, shadow); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); if (freepage != NULL) freepage(page); @@ -694,7 +682,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); return 0; } @@ -1931,8 +1918,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec) unsigned long inactive; unsigned long active; - inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); - active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); + active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); return active > inactive; } @@ -2071,7 +2058,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * system is under heavy pressure. 
*/ if (!inactive_file_is_low(lruvec) && - get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2097,10 +2084,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon in [0], file in [1] */ - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { @@ -2138,7 +2125,7 @@ out: unsigned long size; unsigned long scan; - size = get_lru_size(lruvec, lru); + size = lruvec_lru_size(lruvec, lru); scan = size >> sc->priority; if (!scan && pass && force_scan) diff --git a/mm/vmstat.c b/mm/vmstat.c index 084c6725b373..69ce64f7b8d7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -924,19 +924,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, #endif #ifdef CONFIG_PROC_FS -static char * const migratetype_names[MIGRATE_TYPES] = { - "Unmovable", - "Movable", - "Reclaimable", - "HighAtomic", -#ifdef CONFIG_CMA - "CMA", -#endif -#ifdef CONFIG_MEMORY_ISOLATION - "Isolate", -#endif -}; - static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -1133,7 +1120,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) #ifdef CONFIG_PAGE_OWNER int mtype; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return; drain_all_pages(NULL); diff --git a/mm/workingset.c b/mm/workingset.c index 61ead9e5549d..6130ba0b2641 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -152,8 +152,25 @@ * refault distance will immediately activate the refaulting page. */ -static void *pack_shadow(unsigned long eviction, struct zone *zone) +#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ + ZONES_SHIFT + NODES_SHIFT + \ + MEM_CGROUP_ID_SHIFT) +#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) + +/* + * Eviction timestamps need to be able to cover the full range of + * actionable refaults. However, bits are tight in the radix tree + * entry, and after storing the identifier for the lruvec there might + * not be enough left to represent every single actionable refault. In + * that case, we have to sacrifice granularity for distance, and group + * evictions into coarser buckets by shaving off lower timestamp bits. 
+ */ +static unsigned int bucket_order __read_mostly; + +static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) { + eviction >>= bucket_order; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); @@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone) return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } -static void unpack_shadow(void *shadow, - struct zone **zone, - unsigned long *distance) +static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, + unsigned long *evictionp) { unsigned long entry = (unsigned long)shadow; - unsigned long eviction; - unsigned long refault; - unsigned long mask; - int zid, nid; + int memcgid, nid, zid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; zid = entry & ((1UL << ZONES_SHIFT) - 1); entry >>= ZONES_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; - eviction = entry; + memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); + entry >>= MEM_CGROUP_ID_SHIFT; - *zone = NODE_DATA(nid)->node_zones + zid; - - refault = atomic_long_read(&(*zone)->inactive_age); - mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + - RADIX_TREE_EXCEPTIONAL_SHIFT); - /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. - * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. - */ - *distance = (refault - eviction) & mask; + *memcgidp = memcgid; + *zonep = NODE_DATA(nid)->node_zones + zid; + *evictionp = entry << bucket_order; } /** @@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { + struct mem_cgroup *memcg = page_memcg(page); struct zone *zone = page_zone(page); + int memcgid = mem_cgroup_id(memcg); unsigned long eviction; + struct lruvec *lruvec; - eviction = atomic_long_inc_return(&zone->inactive_age); - return pack_shadow(eviction, zone); + /* Page is fully exclusive and pins page->mem_cgroup */ + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + eviction = atomic_long_inc_return(&lruvec->inactive_age); + return pack_shadow(memcgid, zone, eviction); } /** @@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) bool workingset_refault(void *shadow) { unsigned long refault_distance; + unsigned long active_file; + struct mem_cgroup *memcg; + unsigned long eviction; + struct lruvec *lruvec; + unsigned long refault; struct zone *zone; + int memcgid; + + unpack_shadow(shadow, &memcgid, &zone, &eviction); + + rcu_read_lock(); + /* + * Look up the memcg associated with the stored ID. It might + * have been deleted since the page's eviction. 
+ * + * Note that in rare events the ID could have been recycled + * for a new cgroup that refaults a shared page. This is + * impossible to tell from the available data. However, this + * should be a rare and limited disturbance, and activations + * are always speculative anyway. Ultimately, it's the aging + * algorithm's job to shake out the minimum access frequency + * for the active cache. + * + * XXX: On !CONFIG_MEMCG, this will always return NULL; it + * would be better if the root_mem_cgroup existed in all + * configurations instead. + */ + memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !memcg) { + rcu_read_unlock(); + return false; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + refault = atomic_long_read(&lruvec->inactive_age); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + rcu_read_unlock(); + + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. + * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then can result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + refault_distance = (refault - eviction) & EVICTION_MASK; - unpack_shadow(shadow, &zone, &refault_distance); inc_zone_state(zone, WORKINGSET_REFAULT); - if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + if (refault_distance <= active_file) { inc_zone_state(zone, WORKINGSET_ACTIVATE); return true; } @@ -249,7 +305,22 @@ bool workingset_refault(void *shadow) */ void workingset_activation(struct page *page) { - atomic_long_inc(&page_zone(page)->inactive_age); + struct lruvec *lruvec; + + lock_page_memcg(page); + /* + * Filter non-memcg pages here, e.g. unmap can call + * mark_page_accessed() on VDSO pages. + * + * XXX: See workingset_refault() - this should return + * root_mem_cgroup even for !CONFIG_MEMCG. + */ + if (!mem_cgroup_disabled() && !page_memcg(page)) + goto out; + lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); + atomic_long_inc(&lruvec->inactive_age); +out: + unlock_page_memcg(page); } /* @@ -398,8 +469,25 @@ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { + unsigned int timestamp_bits; + unsigned int max_order; int ret; + BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); + /* + * Calculate the eviction bucket size to cover the longest + * actionable refault distance, which is currently half of + * memory (totalram_pages/2). However, memory hotplug may add + * some more pages at runtime, so keep working with up to + * double the initial memory by using totalram_pages as-is. + */ + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; + max_order = fls_long(totalram_pages - 1); + if (max_order > timestamp_bits) + bucket_order = max_order - timestamp_bits; + printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", + timestamp_bits, max_order, bucket_order); + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); if (ret) goto err; |
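The pack_shadow()/unpack_shadow() and workingset_init() hunks above squeeze a (memcg id, node, zone, eviction timestamp) tuple into one radix-tree shadow entry, shaving the low bucket_order bits off the timestamp so the tuple still fits in an unsigned long; a refault's distance is then the masked difference between the lruvec's current inactive_age and the unpacked timestamp. The following is only a self-contained user-space sketch of that bit layout, assuming a 64-bit unsigned long and made-up field widths (EXCEPTIONAL_BITS, MEMCG_BITS, NODE_BITS, ZONE_BITS) rather than the kernel's config-dependent constants:

#include <assert.h>
#include <stdio.h>

/* Made-up field widths; the kernel derives these from RADIX_TREE_*,
 * MEM_CGROUP_ID_SHIFT, NODES_SHIFT and ZONES_SHIFT. */
#define EXCEPTIONAL_BITS 2
#define MEMCG_BITS       16
#define NODE_BITS        6
#define ZONE_BITS        2
#define EVICTION_SHIFT   (EXCEPTIONAL_BITS + ZONE_BITS + NODE_BITS + MEMCG_BITS)
#define EVICTION_MASK    (~0UL >> EVICTION_SHIFT)

static unsigned int bucket_order;    /* low timestamp bits sacrificed */

static unsigned long pack_shadow(int memcgid, int nid, int zid,
                                 unsigned long eviction)
{
    eviction >>= bucket_order;
    eviction = (eviction << MEMCG_BITS) | memcgid;
    eviction = (eviction << NODE_BITS) | nid;
    eviction = (eviction << ZONE_BITS) | zid;
    eviction = (eviction << EXCEPTIONAL_BITS) | 1;    /* "exceptional" tag */
    return eviction;
}

static void unpack_shadow(unsigned long entry, int *memcgid, int *nid,
                          int *zid, unsigned long *eviction)
{
    entry >>= EXCEPTIONAL_BITS;
    *zid = entry & ((1UL << ZONE_BITS) - 1);
    entry >>= ZONE_BITS;
    *nid = entry & ((1UL << NODE_BITS) - 1);
    entry >>= NODE_BITS;
    *memcgid = entry & ((1UL << MEMCG_BITS) - 1);
    entry >>= MEMCG_BITS;
    *eviction = entry << bucket_order;    /* coarsened timestamp back */
}

int main(void)
{
    unsigned long shadow, eviction, now, distance;
    int memcgid, nid, zid;
    /* Mirror workingset_init(): trade timestamp granularity for range. */
    unsigned int timestamp_bits = 8 * sizeof(long) - EVICTION_SHIFT;
    unsigned int max_order = 44;    /* pretend fls_long(totalram_pages - 1) */

    if (max_order > timestamp_bits)
        bucket_order = max_order - timestamp_bits;
    assert(bucket_order < 8 * sizeof(long));    /* sketch needs 64-bit longs */

    shadow = pack_shadow(42, 1, 2, 1000000UL);
    unpack_shadow(shadow, &memcgid, &nid, &zid, &eviction);

    now = 2000000UL;    /* pretend current lruvec inactive_age */
    distance = (now - eviction) & EVICTION_MASK;
    printf("memcg=%d node=%d zone=%d eviction=%lu distance=%lu bucket_order=%u\n",
           memcgid, nid, zid, eviction, distance, bucket_order);
    return 0;
}

With the pretend max_order of 44 and 38 usable timestamp bits, bucket_order comes out as 6, i.e. evictions are grouped into buckets of 64 timestamp ticks before being packed.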