summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c281
1 files changed, 183 insertions, 98 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a2214c64ed3c..ca423cc20b59 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -607,6 +607,9 @@ static bool need_debug_guardpage(void)
if (!debug_pagealloc_enabled())
return false;
+ if (!debug_guardpage_minorder())
+ return false;
+
return true;
}
@@ -615,6 +618,9 @@ static void init_debug_guardpage(void)
if (!debug_pagealloc_enabled())
return;
+ if (!debug_guardpage_minorder())
+ return;
+
_debug_guardpage_enabled = true;
}
@@ -635,19 +641,22 @@ static int __init debug_guardpage_minorder_setup(char *buf)
pr_info("Setting debug_guardpage_minorder to %lu\n", res);
return 0;
}
-__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
-static inline void set_page_guard(struct zone *zone, struct page *page,
+static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
struct page_ext *page_ext;
if (!debug_guardpage_enabled())
- return;
+ return false;
+
+ if (order >= debug_guardpage_minorder())
+ return false;
page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
- return;
+ return false;
__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
@@ -655,6 +664,8 @@ static inline void set_page_guard(struct zone *zone, struct page *page,
set_page_private(page, order);
/* Guard pages are not available for any usage */
__mod_zone_freepage_state(zone, -(1 << order), migratetype);
+
+ return true;
}
static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -676,9 +687,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
-struct page_ext_operations debug_guardpage_ops = { NULL, };
-static inline void set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
+struct page_ext_operations debug_guardpage_ops;
+static inline bool set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
@@ -1393,15 +1404,18 @@ static void __init deferred_free_range(struct page *page,
return;
/* Free a large naturally-aligned chunk if possible */
- if (nr_pages == MAX_ORDER_NR_PAGES &&
- (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
+ if (nr_pages == pageblock_nr_pages &&
+ (pfn & (pageblock_nr_pages - 1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, MAX_ORDER-1);
+ __free_pages_boot_core(page, pageblock_order);
return;
}
- for (i = 0; i < nr_pages; i++, page++)
+ for (i = 0; i < nr_pages; i++, page++, pfn++) {
+ if ((pfn & (pageblock_nr_pages - 1)) == 0)
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_boot_core(page, 0);
+ }
}
/* Completion tracking for deferred_init_memmap() threads */
@@ -1469,9 +1483,9 @@ static int __init deferred_init_memmap(void *data)
/*
* Ensure pfn_valid is checked every
- * MAX_ORDER_NR_PAGES for memory holes
+ * pageblock_nr_pages for memory holes
*/
- if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+ if ((pfn & (pageblock_nr_pages - 1)) == 0) {
if (!pfn_valid(pfn)) {
page = NULL;
goto free_range;
@@ -1484,7 +1498,7 @@ static int __init deferred_init_memmap(void *data)
}
/* Minimise pfn page lookups and scheduler checks */
- if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
+ if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
page++;
} else {
nr_pages += nr_to_free;
@@ -1520,6 +1534,9 @@ free_range:
free_base_page = NULL;
free_base_pfn = nr_to_free = 0;
}
+ /* Free the last block of pages to allocator */
+ nr_pages += nr_to_free;
+ deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
first_init_pfn = max(end_pfn, first_init_pfn);
}
@@ -1616,18 +1633,15 @@ static inline void expand(struct zone *zone, struct page *page,
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
- debug_guardpage_enabled() &&
- high < debug_guardpage_minorder()) {
- /*
- * Mark as guard pages (or page), that will allow to
- * merge back to allocator when buddy will be freed.
- * Corresponding page table entries will not be touched,
- * pages will stay not present in virtual address space
- */
- set_page_guard(zone, &page[size], high, migratetype);
+ /*
+ * Mark as guard pages (or page), that will allow to
+ * merge back to allocator when buddy will be freed.
+ * Corresponding page table entries will not be touched,
+ * pages will stay not present in virtual address space
+ */
+ if (set_page_guard(zone, &page[size], high, migratetype))
continue;
- }
+
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
@@ -2489,9 +2503,14 @@ int __isolate_free_page(struct page *page, unsigned int order)
mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {
- /* Obey watermarks as if the page was being allocated */
- watermark = low_wmark_pages(zone) + (1 << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ /*
+ * Obey watermarks as if the page was being allocated. We can
+ * emulate a high-order watermark check with a raised order-0
+ * watermark, because we already know our high-order page
+ * exists.
+ */
+ watermark = min_wmark_pages(zone) + (1UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -2960,9 +2979,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
-void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
+void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
+ struct va_format vaf;
+ va_list args;
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
debug_guardpage_minorder() > 0)
@@ -2980,22 +3001,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
- if (fmt) {
- struct va_format vaf;
- va_list args;
+ pr_warn("%s: ", current->comm);
- va_start(args, fmt);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_cont("%pV", &vaf);
+ va_end(args);
- vaf.fmt = fmt;
- vaf.va = &args;
+ pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
- pr_warn("%pV", &vaf);
-
- va_end(args);
- }
-
- pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
- current->comm, order, gfp_mask, &gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
@@ -3137,6 +3152,65 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
return NULL;
}
+static inline bool
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+ enum compact_result compact_result,
+ enum compact_priority *compact_priority,
+ int *compaction_retries)
+{
+ int max_retries = MAX_COMPACT_RETRIES;
+ int min_priority;
+
+ if (!order)
+ return false;
+
+ if (compaction_made_progress(compact_result))
+ (*compaction_retries)++;
+
+ /*
+ * compaction considers all the zone as desperately out of memory
+ * so it doesn't really make much sense to retry except when the
+ * failure could be caused by insufficient priority
+ */
+ if (compaction_failed(compact_result))
+ goto check_priority;
+
+ /*
+ * make sure the compaction wasn't deferred or didn't bail out early
+ * due to locks contention before we declare that we should give up.
+ * But do not retry if the given zonelist is not suitable for
+ * compaction.
+ */
+ if (compaction_withdrawn(compact_result))
+ return compaction_zonelist_suitable(ac, order, alloc_flags);
+
+ /*
+ * !costly requests are much more important than __GFP_REPEAT
+ * costly ones because they are de facto nofail and invoke OOM
+ * killer to move on while costly can fail and users are ready
+ * to cope with that. 1/4 retries is rather arbitrary but we
+ * would need much more detailed feedback from compaction to
+ * make a better decision.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ max_retries /= 4;
+ if (*compaction_retries <= max_retries)
+ return true;
+
+ /*
+ * Make sure there are attempts at the highest priority if we exhausted
+ * all retries or failed at the lower priorities.
+ */
+check_priority:
+ min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+ MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+ if (*compact_priority > min_priority) {
+ (*compact_priority)--;
+ *compaction_retries = 0;
+ return true;
+ }
+ return false;
+}
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
@@ -3147,13 +3221,11 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
return NULL;
}
-#endif /* CONFIG_COMPACTION */
-
static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
- int compaction_retries)
+ int *compaction_retries)
{
struct zone *zone;
struct zoneref *z;
@@ -3175,6 +3247,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
}
return false;
}
+#endif /* CONFIG_COMPACTION */
/* Perform direct synchronous page reclaim */
static int
@@ -3325,16 +3398,26 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
struct alloc_context *ac, int alloc_flags,
- bool did_some_progress, int no_progress_loops)
+ bool did_some_progress, int *no_progress_loops)
{
struct zone *zone;
struct zoneref *z;
/*
+ * Costly allocations might have made a progress but this doesn't mean
+ * their order will become available due to high fragmentation so
+ * always increment the no progress counter for them
+ */
+ if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
+ *no_progress_loops = 0;
+ else
+ (*no_progress_loops)++;
+
+ /*
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
- if (no_progress_loops > MAX_RECLAIM_RETRIES)
+ if (*no_progress_loops > MAX_RECLAIM_RETRIES)
return false;
/*
@@ -3349,7 +3432,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
unsigned long reclaimable;
available = reclaimable = zone_reclaimable_pages(zone);
- available -= DIV_ROUND_UP(no_progress_loops * available,
+ available -= DIV_ROUND_UP((*no_progress_loops) * available,
MAX_RECLAIM_RETRIES);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
@@ -3410,6 +3493,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
enum compact_result compact_result;
int compaction_retries = 0;
int no_progress_loops = 0;
+ unsigned long alloc_start = jiffies;
+ unsigned int stall_timeout = 10 * HZ;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3554,9 +3639,6 @@ retry:
if (page)
goto got_pg;
- if (order && compaction_made_progress(compact_result))
- compaction_retries++;
-
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
goto nopage;
@@ -3568,18 +3650,16 @@ retry:
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
goto nopage;
- /*
- * Costly allocations might have made a progress but this doesn't mean
- * their order will become available due to high fragmentation so
- * always increment the no progress counter for them
- */
- if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
- no_progress_loops = 0;
- else
- no_progress_loops++;
+ /* Make sure we know about allocations which stall for too long */
+ if (time_after(jiffies, alloc_start + stall_timeout)) {
+ warn_alloc(gfp_mask,
+ "page alloction stalls for %ums, order:%u\n",
+ jiffies_to_msecs(jiffies-alloc_start), order);
+ stall_timeout += 10 * HZ;
+ }
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
- did_some_progress > 0, no_progress_loops))
+ did_some_progress > 0, &no_progress_loops))
goto retry;
/*
@@ -3591,7 +3671,7 @@ retry:
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
- compaction_retries))
+ &compaction_retries))
goto retry;
/* Reclaim has failed us, start killing things */
@@ -3606,7 +3686,8 @@ retry:
}
nopage:
- warn_alloc_failed(gfp_mask, order, NULL);
+ warn_alloc(gfp_mask,
+ "page allocation failure: order:%u", order);
got_pg:
return page;
}
@@ -4555,7 +4636,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
int j;
struct zonelist *zonelist;
- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
;
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
@@ -4571,7 +4652,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
int j;
struct zonelist *zonelist;
- zonelist = &pgdat->node_zonelists[1];
+ zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
j = build_zonelists_node(pgdat, zonelist, 0);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
@@ -4592,7 +4673,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
struct zone *z;
struct zonelist *zonelist;
- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
pos = 0;
for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
for (j = 0; j < nr_nodes; j++) {
@@ -4727,7 +4808,7 @@ static void build_zonelists(pg_data_t *pgdat)
local_node = pgdat->node_id;
- zonelist = &pgdat->node_zonelists[0];
+ zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
j = build_zonelists_node(pgdat, zonelist, 0);
/*
@@ -5000,15 +5081,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
- * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
- * from zone_movable_pfn[nid] to end of each node should be
- * ZONE_MOVABLE not ZONE_NORMAL. skip it.
- */
- if (!mirrored_kernelcore && zone_movable_pfn[nid])
- if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
- continue;
-
- /*
* Check given memblock attribute by firmware which can affect
* kernel memory layout. If zone==ZONE_MOVABLE but memory is
* mirrored, it's an overlapped memmap init. skip it.
@@ -5451,6 +5523,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);
+ /* Adjust for ZONE_MOVABLE starting within this range */
+ } else if (!mirrored_kernelcore &&
+ *zone_start_pfn < zone_movable_pfn[nid] &&
+ *zone_end_pfn > zone_movable_pfn[nid]) {
+ *zone_end_pfn = zone_movable_pfn[nid];
+
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
@@ -5554,28 +5632,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
* Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
* and vice versa.
*/
- if (zone_movable_pfn[nid]) {
- if (mirrored_kernelcore) {
- unsigned long start_pfn, end_pfn;
- struct memblock_region *r;
-
- for_each_memblock(memory, r) {
- start_pfn = clamp(memblock_region_memory_base_pfn(r),
- zone_start_pfn, zone_end_pfn);
- end_pfn = clamp(memblock_region_memory_end_pfn(r),
- zone_start_pfn, zone_end_pfn);
-
- if (zone_type == ZONE_MOVABLE &&
- memblock_is_mirror(r))
- nr_absent += end_pfn - start_pfn;
-
- if (zone_type == ZONE_NORMAL &&
- !memblock_is_mirror(r))
- nr_absent += end_pfn - start_pfn;
- }
- } else {
- if (zone_type == ZONE_NORMAL)
- nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ if (mirrored_kernelcore && zone_movable_pfn[nid]) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
}
}
@@ -6929,6 +7002,17 @@ static int __init set_hashdist(char *str)
__setup("hashdist=", set_hashdist);
#endif
+#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+/*
+ * Returns the number of pages that arch has reserved but
+ * is not known to alloc_large_system_hash().
+ */
+static unsigned long __init arch_reserved_kernel_pages(void)
+{
+ return 0;
+}
+#endif
+
/*
* allocate a large system hash table from bootmem
* - it is assumed that the hash table must contain an exact power-of-2
@@ -6953,6 +7037,7 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
numentries = nr_kernel_pages;
+ numentries -= arch_reserved_kernel_pages();
/* It isn't necessary when PAGE_SIZE >= 1MB */
if (PAGE_SHIFT < 20)