summary | refs | log | tree | commit | diff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- mm/page_alloc.c 993
1 files changed, 714 insertions, 279 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40e29429e7b0..beda41710802 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,6 +61,7 @@
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/page_owner.h>
+#include <linux/kthread.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -235,6 +236,75 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void reset_deferred_meminit(pg_data_t *pgdat)
+{
+ pgdat->first_deferred_pfn = ULONG_MAX; /* deferred_init_memmap() treats ULONG_MAX as "nothing deferred" */
+}
+
+/*
+ * Returns true if the struct page for the pfn is uninitialised, i.e. the
+ * pfn lies at or beyond first_deferred_pfn of the node that
+ * early_pfn_to_nid() reports for it.
+ */
+static inline bool __meminit early_page_uninitialised(unsigned long pfn)
+{
+ if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn)
+ return true;
+
+ return false;
+}
+
+static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
+{
+ if (pfn >= NODE_DATA(nid)->first_deferred_pfn) /* same test, node supplied by the caller */
+ return true;
+
+ return false;
+}
+
+/*
+ * Returns false when the remaining initialisation should be deferred until
+ * later in the boot cycle when it can be parallelised.
+ *
+ * A zone that ends below the end of the node is never deferred. Otherwise
+ * every call counts one more page in *nr_initialised; once that count
+ * exceeds 2UL << (30 - PAGE_SHIFT) pages and pfn is aligned to
+ * PAGES_PER_SECTION, pfn is stored in pgdat->first_deferred_pfn and false
+ * is returned.
+ */
+static inline bool update_defer_init(pg_data_t *pgdat,
+ unsigned long pfn, unsigned long zone_end,
+ unsigned long *nr_initialised)
+{
+ /* Always populate low zones for address-constrained allocations */
+ if (zone_end < pgdat_end_pfn(pgdat))
+ return true;
+
+ /* Initialise at least 2G of the highest zone */
+ (*nr_initialised)++;
+ if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ pgdat->first_deferred_pfn = pfn;
+ return false;
+ }
+
+ return true;
+}
+#else
+static inline void reset_deferred_meminit(pg_data_t *pgdat)
+{
+}
+
+static inline bool early_page_uninitialised(unsigned long pfn)
+{
+ return false;
+}
+
+static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
+{
+ return false;
+}
+
+static inline bool update_defer_init(pg_data_t *pgdat,
+ unsigned long pfn, unsigned long zone_end,
+ unsigned long *nr_initialised)
+{
+ return true;
+}
+#endif
+
+
void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled &&
@@ -380,20 +450,6 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
-static inline void prep_zero_page(struct page *page, unsigned int order,
- gfp_t gfp_flags)
-{
- int i;
-
- /*
- * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
- * and __GFP_HIGHMEM from hard or soft interrupt context.
- */
- VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
- for (i = 0; i < (1 << order); i++)
- clear_highpage(page + i);
-}
-
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly;
@@ -778,6 +834,75 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
return 0;
}
+static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+ unsigned long zone, int nid)
+{
+ set_page_links(page, zone, nid, pfn);
+ init_page_count(page);
+ page_mapcount_reset(page);
+ page_cpupid_reset_last(page);
+
+ INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+ /*
+ * Record the address __va() yields for this pfn, for non-highmem
+ * zones only. The shift won't overflow because ZONE_NORMAL is
+ * below 4G.
+ */
+ if (!is_highmem_idx(zone))
+ set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+}
+
+/*
+ * Initialise the struct page that pfn_to_page() returns for @pfn by passing
+ * it, together with @pfn, the zone index @zone and the node id @nid, to
+ * __init_single_page().
+ */
+static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
+ int nid)
+{
+ /* Plain call: a void function may not return an expression (C11 6.8.6.4) */
+ __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void init_reserved_page(unsigned long pfn)
+{
+ pg_data_t *pgdat;
+ int nid, zid;
+
+ if (!early_page_uninitialised(pfn))
+ return; /* pfn is below the node's first_deferred_pfn: nothing to do */
+
+ nid = early_pfn_to_nid(pfn);
+ pgdat = NODE_DATA(nid);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) { /* find the zone of this node that spans pfn */
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
+ break;
+ }
+ __init_single_pfn(pfn, zid, nid); /* NOTE(review): zid == MAX_NR_ZONES here if no zone matched - confirm callers rule that out */
+}
+#else
+static inline void init_reserved_page(unsigned long pfn)
+{
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+/*
+ * Initialised pages do not have PageReserved set. This function is
+ * called for each range allocated by the bootmem allocator and
+ * marks the pages PageReserved. The remaining valid pages are later
+ * sent to the buddy page allocator.
+ *
+ * The range is walked from PFN_DOWN(start) up to, but not including,
+ * PFN_UP(end). Every pfn that passes pfn_valid() is handed to
+ * init_reserved_page() before SetPageReserved() is applied to its
+ * struct page; pfns that fail pfn_valid() are skipped.
+ */
+void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
+{
+ unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long end_pfn = PFN_UP(end);
+
+ for (; start_pfn < end_pfn; start_pfn++) {
+ if (pfn_valid(start_pfn)) {
+ struct page *page = pfn_to_page(start_pfn);
+
+ init_reserved_page(start_pfn);
+ SetPageReserved(page);
+ }
+ }
+}
+
static bool free_pages_prepare(struct page *page, unsigned int order)
{
bool compound = PageCompound(page);
@@ -832,7 +957,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-void __init __free_pages_bootmem(struct page *page, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page,
+ unsigned long pfn, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -852,6 +978,235 @@ void __init __free_pages_bootmem(struct page *page, unsigned int order)
__free_pages(page, order);
}
+#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
+ defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
+
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+ static DEFINE_SPINLOCK(early_pfn_lock); /* held around the cached lookup below */
+ int nid;
+
+ spin_lock(&early_pfn_lock);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
+ if (nid < 0)
+ nid = 0; /* a negative lookup result is reported as node 0 */
+ spin_unlock(&early_pfn_lock);
+
+ return nid;
+}
+#endif
+
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
+{
+ int nid;
+
+ nid = __early_pfn_to_nid(pfn, state);
+ if (nid >= 0 && nid != node) /* a negative nid is not treated as a mismatch */
+ return false;
+ return true;
+}
+
+/*
+ * Only safe to use early in boot when initialisation is single-threaded:
+ * the lookup goes through the shared early_pfnnid_cache and no lock is
+ * taken here.
+ */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+ return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
+}
+
+#else
+
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+ return true;
+}
+static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
+{
+ return true;
+}
+#endif
+
+
+/*
+ * Hand the block of 2^@order pages starting at @page (whose first page frame
+ * is @pfn) to __free_pages_boot_core().
+ *
+ * Nothing is done when early_page_uninitialised() reports that the struct
+ * page for @pfn has not been initialised yet.
+ * NOTE(review): such pages are presumably released later by
+ * deferred_init_memmap() - confirm.
+ */
+void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
+ unsigned int order)
+{
+ if (early_page_uninitialised(pfn))
+ return;
+ /* Plain call: a void function may not return an expression (C11 6.8.6.4) */
+ __free_pages_boot_core(page, pfn, order);
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void __init deferred_free_range(struct page *page,
+ unsigned long pfn, int nr_pages)
+{
+ int i;
+
+ if (!page)
+ return; /* caller had no batch pending */
+
+ /*
+ * Free a large naturally-aligned chunk if possible: exactly
+ * MAX_ORDER_NR_PAGES pages starting on a MAX_ORDER_NR_PAGES
+ * boundary are released as a single order MAX_ORDER-1 block.
+ */
+ if (nr_pages == MAX_ORDER_NR_PAGES &&
+ (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ __free_pages_boot_core(page, pfn, MAX_ORDER-1);
+ return;
+ }
+
+ for (i = 0; i < nr_pages; i++, page++, pfn++) /* otherwise release one order-0 page at a time */
+ __free_pages_boot_core(page, pfn, 0);
+}
+
+/*
+ * Completion tracking for deferred_init_memmap() threads:
+ * pgdat_init_n_undone counts the reports still outstanding, and the report
+ * that brings it to zero completes pgdat_init_all_done_comp.
+ */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+ if (atomic_dec_and_test(&pgdat_init_n_undone))
+ complete(&pgdat_init_all_done_comp);
+}
+
+/*
+ * Initialise remaining memory on a node.
+ *
+ * Thread function started by page_alloc_init_late(); @data is the node's
+ * pg_data_t. Walks the node's memory pfn ranges from
+ * pgdat->first_deferred_pfn, runs __init_single_page() on each struct page
+ * whose flags are still zero and passes the pages to deferred_free_range()
+ * in batches. Always returns 0 and calls pgdat_init_report_one_done() once
+ * on each return path.
+ */
+static int __init deferred_init_memmap(void *data)
+{
+ pg_data_t *pgdat = data;
+ int nid = pgdat->node_id;
+ struct mminit_pfnnid_cache nid_init_state = { };
+ unsigned long start = jiffies;
+ unsigned long nr_pages = 0;
+ unsigned long walk_start, walk_end;
+ int i, zid;
+ struct zone *zone;
+ unsigned long first_init_pfn = pgdat->first_deferred_pfn;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (first_init_pfn == ULONG_MAX) {
+ pgdat_init_report_one_done();
+ return 0;
+ }
+
+ /* Bind memory initialisation thread to a local node if possible */
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(current, cpumask);
+
+ /* Sanity check boundaries */
+ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
+ BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+ pgdat->first_deferred_pfn = ULONG_MAX;
+
+ /* Only the highest zone is deferred so find it */
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ zone = pgdat->node_zones + zid;
+ if (first_init_pfn < zone_end_pfn(zone))
+ break;
+ }
+
+ for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
+ unsigned long pfn, end_pfn;
+ struct page *page = NULL;
+ struct page *free_base_page = NULL;
+ unsigned long free_base_pfn = 0;
+ int nr_to_free = 0;
+
+ end_pfn = min(walk_end, zone_end_pfn(zone));
+ pfn = first_init_pfn;
+ if (pfn < walk_start)
+ pfn = walk_start;
+ if (pfn < zone->zone_start_pfn)
+ pfn = zone->zone_start_pfn;
+
+ for (; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ goto free_range;
+
+ /*
+ * Ensure pfn_valid is checked every
+ * MAX_ORDER_NR_PAGES for memory holes
+ */
+ if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+ if (!pfn_valid(pfn)) {
+ page = NULL;
+ goto free_range;
+ }
+ }
+
+ if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+ page = NULL;
+ goto free_range;
+ }
+
+ /*
+ * Minimise pfn page lookups and scheduler checks: within a
+ * MAX_ORDER_NR_PAGES-aligned chunk just step the page pointer;
+ * at a chunk boundary (or with no current page) flush the
+ * pending batch, look the page up again and offer to
+ * reschedule.
+ */
+ if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
+ page++;
+ } else {
+ nr_pages += nr_to_free;
+ deferred_free_range(free_base_page,
+ free_base_pfn, nr_to_free);
+ free_base_page = NULL;
+ free_base_pfn = nr_to_free = 0;
+
+ page = pfn_to_page(pfn);
+ cond_resched();
+ }
+
+ if (page->flags) { /* non-zero flags: leave this page alone and flush the batch */
+ VM_BUG_ON(page_zone(page) != zone);
+ goto free_range;
+ }
+
+ __init_single_page(page, pfn, zid, nid);
+ if (!free_base_page) {
+ free_base_page = page;
+ free_base_pfn = pfn;
+ nr_to_free = 0;
+ }
+ nr_to_free++;
+
+ /* Where possible, batch up pages for a single free */
+ continue;
+free_range:
+ /* Free the current block of pages to allocator */
+ nr_pages += nr_to_free;
+ deferred_free_range(free_base_page, free_base_pfn,
+ nr_to_free);
+ free_base_page = NULL;
+ free_base_pfn = nr_to_free = 0;
+ }
+
+ first_init_pfn = max(end_pfn, first_init_pfn);
+ }
+
+ /*
+ * Sanity check that the next zone really is unpopulated. Note that
+ * the condition itself advances zid and, when evaluated, zone.
+ */
+ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+
+ pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
+ jiffies_to_msecs(jiffies - start));
+
+ pgdat_init_report_one_done();
+ return 0;
+}
+
+void __init page_alloc_init_late(void)
+{
+ int nid;
+
+ /*
+ * There will be num_node_state(N_MEMORY) threads.
+ * NOTE(review): the kthread_run() result is not checked; a thread
+ * that failed to start would never report done and the wait below
+ * would not finish - confirm whether that can happen at this point.
+ */
+ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
+ for_each_node_state(nid, N_MEMORY) {
+ kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+ }
+
+ /* Block until all are initialised */
+ wait_for_completion(&pgdat_init_all_done_comp);
+
+ /* Reinit limits that are based on free pages after the kernel is up */
+ files_maxfiles_init();
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
@@ -941,6 +1296,10 @@ static inline int check_new_page(struct page *page)
bad_reason = "non-NULL mapping";
if (unlikely(atomic_read(&page->_count) != 0))
bad_reason = "nonzero _count";
+ if (unlikely(page->flags & __PG_HWPOISON)) {
+ bad_reason = "HWPoisoned (hardware-corrupted)";
+ bad_flags = __PG_HWPOISON;
+ }
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
@@ -975,7 +1334,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
kasan_alloc_pages(page, order);
if (gfp_flags & __GFP_ZERO)
- prep_zero_page(page, order, gfp_flags);
+ for (i = 0; i < (1 << order); i++)
+ clear_highpage(page + i);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
@@ -1032,11 +1392,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#ifdef CONFIG_CMA
- [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
[MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
-#else
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#endif
[MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
@@ -1044,6 +1402,17 @@ static int fallbacks[MIGRATE_TYPES][4] = {
#endif
};
+#ifdef CONFIG_CMA
+static struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order)
+{
+ return __rmqueue_smallest(zone, order, MIGRATE_CMA); /* same call as the normal path, with migratetype MIGRATE_CMA */
+}
+#else
+static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order) { return NULL; } /* !CONFIG_CMA: never yields a page */
+#endif
+
/*
* Move the free pages in a range to the free lists of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
@@ -1136,14 +1505,40 @@ static void change_pageblock_range(struct page *pageblock_page,
* as fragmentation caused by those allocations polluting movable pageblocks
* is worse than movable allocations stealing from unmovable and reclaimable
* pageblocks.
- *
- * If we claim more than half of the pageblock, change pageblock's migratetype
- * as well.
*/
-static void try_to_steal_freepages(struct zone *zone, struct page *page,
- int start_type, int fallback_type)
+static bool can_steal_fallback(unsigned int order, int start_mt)
+{
+ /*
+ * This order check is kept on purpose even though the check below
+ * also accepts smaller orders. When this condition is met we can
+ * actually steal the whole pageblock; the check below does not
+ * guarantee that, and being only a heuristic it may change at any
+ * time.
+ */
+ if (order >= pageblock_order)
+ return true;
+
+ if (order >= pageblock_order / 2 ||
+ start_mt == MIGRATE_RECLAIMABLE ||
+ start_mt == MIGRATE_UNMOVABLE ||
+ page_group_by_mobility_disabled)
+ return true;
+
+ return false;
+}
+
+/*
+ * This function implements actual steal behaviour. If order is large enough,
+ * we can steal whole pageblock. If not, we first move freepages in this
+ * pageblock and check whether half of pages are moved or not. If half of
+ * pages are moved, we can change migratetype of pageblock and permanently
+ * use its pages as requested migratetype in the future.
+ */
+static void steal_suitable_fallback(struct zone *zone, struct page *page,
+ int start_type)
{
int current_order = page_order(page);
+ int pages;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
@@ -1151,19 +1546,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page,
return;
}
- if (current_order >= pageblock_order / 2 ||
- start_type == MIGRATE_RECLAIMABLE ||
- start_type == MIGRATE_UNMOVABLE ||
- page_group_by_mobility_disabled) {
- int pages;
+ pages = move_freepages_block(zone, page, start_type);
+
+ /* Claim the whole block if over half of it is free */
+ if (pages >= (1 << (pageblock_order-1)) ||
+ page_group_by_mobility_disabled)
+ set_pageblock_migratetype(page, start_type);
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * If only_stealable is true, this function returns fallback_mt only if
+ * we can steal other freepages all together. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal)
+{
+ int i;
+ int fallback_mt;
+
+ if (area->nr_free == 0)
+ return -1;
+
+ *can_steal = false;
+ for (i = 0;; i++) {
+ fallback_mt = fallbacks[migratetype][i];
+ if (fallback_mt == MIGRATE_RESERVE)
+ break;
+
+ if (list_empty(&area->free_list[fallback_mt]))
+ continue;
+
+ if (can_steal_fallback(order, migratetype))
+ *can_steal = true;
- pages = move_freepages_block(zone, page, start_type);
+ if (!only_stealable)
+ return fallback_mt;
- /* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled)
- set_pageblock_migratetype(page, start_type);
+ if (*can_steal)
+ return fallback_mt;
}
+
+ return -1;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -1173,64 +1598,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct free_area *area;
unsigned int current_order;
struct page *page;
+ int fallback_mt;
+ bool can_steal;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1;
current_order >= order && current_order <= MAX_ORDER-1;
--current_order) {
- int i;
- for (i = 0;; i++) {
- int migratetype = fallbacks[start_migratetype][i];
- int buddy_type = start_migratetype;
-
- /* MIGRATE_RESERVE handled later if necessary */
- if (migratetype == MIGRATE_RESERVE)
- break;
-
- area = &(zone->free_area[current_order]);
- if (list_empty(&area->free_list[migratetype]))
- continue;
-
- page = list_entry(area->free_list[migratetype].next,
- struct page, lru);
- area->nr_free--;
-
- if (!is_migrate_cma(migratetype)) {
- try_to_steal_freepages(zone, page,
- start_migratetype,
- migratetype);
- } else {
- /*
- * When borrowing from MIGRATE_CMA, we need to
- * release the excess buddy pages to CMA
- * itself, and we do not try to steal extra
- * free pages.
- */
- buddy_type = migratetype;
- }
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt == -1)
+ continue;
- /* Remove the page from the freelists */
- list_del(&page->lru);
- rmv_page_order(page);
+ page = list_entry(area->free_list[fallback_mt].next,
+ struct page, lru);
+ if (can_steal)
+ steal_suitable_fallback(zone, page, start_migratetype);
- expand(zone, page, order, current_order, area,
- buddy_type);
+ /* Remove the page from the freelists */
+ area->nr_free--;
+ list_del(&page->lru);
+ rmv_page_order(page);
- /*
- * The freepage_migratetype may differ from pageblock's
- * migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
- * does not differ for MIGRATE_CMA pageblocks. For CMA
- * we need to make sure unallocated pages flushed from
- * pcp lists are returned to the correct freelist.
- */
- set_freepage_migratetype(page, buddy_type);
+ expand(zone, page, order, current_order, area,
+ start_migratetype);
+ /*
+ * The freepage_migratetype may differ from pageblock's
+ * migratetype depending on the decisions in
+ * steal_suitable_fallback(). This is OK as long as it
+ * does not differ for MIGRATE_CMA pageblocks. For CMA
+ * we need to make sure unallocated pages flushed from
+ * pcp lists are returned to the correct freelist.
+ */
+ set_freepage_migratetype(page, start_migratetype);
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
- return page;
- }
+ return page;
}
return NULL;
@@ -1249,7 +1655,11 @@ retry_reserve:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
- page = __rmqueue_fallback(zone, order, migratetype);
+ if (migratetype == MIGRATE_MOVABLE)
+ page = __rmqueue_cma_fallback(zone, order);
+
+ if (!page)
+ page = __rmqueue_fallback(zone, order, migratetype);
/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
@@ -1321,7 +1731,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
int to_drain, batch;
local_irq_save(flags);
- batch = ACCESS_ONCE(pcp->batch);
+ batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
free_pcppages_bulk(zone, to_drain, pcp);
@@ -1520,7 +1930,7 @@ void free_hot_cold_page(struct page *page, bool cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
- unsigned long batch = ACCESS_ONCE(pcp->batch);
+ unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
pcp->count -= batch;
}
@@ -1553,6 +1963,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
void split_page(struct page *page, unsigned int order)
{
int i;
+ gfp_t gfp_mask;
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
@@ -1566,10 +1977,11 @@ void split_page(struct page *page, unsigned int order)
split_page(virt_to_page(page[0].shadow), order);
#endif
- set_page_owner(page, 0, 0);
+ gfp_mask = get_page_owner_gfp(page);
+ set_page_owner(page, 0, gfp_mask);
for (i = 1; i < (1 << order); i++) {
set_page_refcounted(page + i);
- set_page_owner(page + i, 0, 0);
+ set_page_owner(page + i, 0, gfp_mask);
}
}
EXPORT_SYMBOL_GPL(split_page);
@@ -1599,6 +2011,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
zone->free_area[order].nr_free--;
rmv_page_order(page);
+ set_page_owner(page, order, __GFP_MOVABLE);
+
/* Set the pageblock if the isolated page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
@@ -1610,7 +2024,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
}
- set_page_owner(page, order, 0);
+
return 1UL << order;
}
@@ -2272,48 +2686,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
show_mem(filter);
}
-static inline int
-should_alloc_retry(gfp_t gfp_mask, unsigned int order,
- unsigned long did_some_progress,
- unsigned long pages_reclaimed)
-{
- /* Do not loop if specifically requested */
- if (gfp_mask & __GFP_NORETRY)
- return 0;
-
- /* Always retry if specifically requested */
- if (gfp_mask & __GFP_NOFAIL)
- return 1;
-
- /*
- * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
- * making forward progress without invoking OOM. Suspend also disables
- * storage devices so kswapd will not help. Bail if we are suspending.
- */
- if (!did_some_progress && pm_suspended_storage())
- return 0;
-
- /*
- * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
- * means __GFP_NOFAIL, but that may not be true in other
- * implementations.
- */
- if (order <= PAGE_ALLOC_COSTLY_ORDER)
- return 1;
-
- /*
- * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
- * specified, then we retry until we no longer reclaim any pages
- * (above), or we've reclaimed an order of pages at least as
- * large as the allocation's order. In both cases, if the
- * allocation still fails, we stop retrying.
- */
- if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
- return 1;
-
- return 0;
-}
-
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
@@ -2323,10 +2695,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*did_some_progress = 0;
/*
- * Acquire the per-zone oom lock for each zone. If that
- * fails, somebody else is making progress for us.
+ * Acquire the oom lock. If that fails, somebody else is
+ * making progress for us.
*/
- if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
+ if (!mutex_trylock(&oom_lock)) {
*did_some_progress = 1;
schedule_timeout_uninterruptible(1);
return NULL;
@@ -2352,23 +2724,19 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
- /* The OOM killer does not compensate for light reclaim */
+ /* The OOM killer does not compensate for IO-less reclaim */
if (!(gfp_mask & __GFP_FS)) {
/*
* XXX: Page reclaim didn't yield anything,
* and the OOM killer can't be invoked, but
- * keep looping as per should_alloc_retry().
+ * keep looping as per tradition.
*/
*did_some_progress = 1;
goto out;
}
- /*
- * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
- * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
- * The caller should handle page allocation failure by itself if
- * it specifies __GFP_THISNODE.
- * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
- */
+ if (pm_suspended_storage())
+ goto out;
+ /* The OOM killer may not free memory on a specific node */
if (gfp_mask & __GFP_THISNODE)
goto out;
}
@@ -2377,7 +2745,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
- oom_zonelist_unlock(ac->zonelist, gfp_mask);
+ mutex_unlock(&oom_lock);
return page;
}
@@ -2623,15 +2991,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
}
/*
- * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
- * __GFP_NOWARN set) should not cause reclaim since the subsystem
- * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
- * using a larger set of nodes after it has established that the
- * allowed per node queues are empty and that nodes are
- * over allocated.
+ * If this allocation cannot block and it is for a specific node, then
+ * fail early. There's no need to wakeup kswapd or retry for a
+ * speculative node-specific allocation.
*/
- if (IS_ENABLED(CONFIG_NUMA) &&
- (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
goto nopage;
retry:
@@ -2754,40 +3118,40 @@ retry:
if (page)
goto got_pg;
- /* Check if we should retry the allocation */
+ /* Do not loop if specifically requested */
+ if (gfp_mask & __GFP_NORETRY)
+ goto noretry;
+
+ /* Keep reclaiming pages as long as there is reasonable progress */
pages_reclaimed += did_some_progress;
- if (should_alloc_retry(gfp_mask, order, did_some_progress,
- pages_reclaimed)) {
- /*
- * If we fail to make progress by freeing individual
- * pages, but the allocation wants us to keep going,
- * start OOM killing tasks.
- */
- if (!did_some_progress) {
- page = __alloc_pages_may_oom(gfp_mask, order, ac,
- &did_some_progress);
- if (page)
- goto got_pg;
- if (!did_some_progress)
- goto nopage;
- }
+ if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
+ ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
goto retry;
- } else {
- /*
- * High-order allocations do not necessarily loop after
- * direct reclaim and reclaim/compaction depends on compaction
- * being called after reclaim so call directly if necessary
- */
- page = __alloc_pages_direct_compact(gfp_mask, order,
- alloc_flags, ac, migration_mode,
- &contended_compaction,
- &deferred_compaction);
- if (page)
- goto got_pg;
}
+ /* Reclaim has failed us, start killing things */
+ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+ if (page)
+ goto got_pg;
+
+ /* Retry as long as the OOM killer is making progress */
+ if (did_some_progress)
+ goto retry;
+
+noretry:
+ /*
+ * High-order allocations do not necessarily loop after
+ * direct reclaim and reclaim/compaction depends on compaction
+ * being called after reclaim so call directly if necessary
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
+ ac, migration_mode,
+ &contended_compaction,
+ &deferred_compaction);
+ if (page)
+ goto got_pg;
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
got_pg:
@@ -2824,7 +3188,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
/*
* Check the zones suitable for the gfp_mask contain at least one
* valid zone. It's possible to have an empty zonelist as a result
- * of GFP_THISNODE and a memoryless node
+ * of __GFP_THISNODE and a memoryless node
*/
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
@@ -2927,6 +3291,104 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
/*
+ * Page Fragment:
+ * An arbitrary-length arbitrary-offset area of memory which resides
+ * within a 0 or higher order page. Multiple fragments within that page
+ * are individually refcounted, in the page's reference counter.
+ *
+ * The page_frag functions below provide a simple allocation framework for
+ * page fragments. This is used by the network stack and network device
+ * drivers to provide a backing region of memory for use as either an
+ * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ *
+ * __page_frag_refill() obtains the backing page. When PAGE_SIZE is smaller
+ * than PAGE_FRAG_CACHE_MAX_SIZE it first tries an order
+ * PAGE_FRAG_CACHE_MAX_ORDER allocation with __GFP_COMP, __GFP_NOWARN,
+ * __GFP_NORETRY and __GFP_NOMEMALLOC added, recording the resulting size in
+ * nc->size. If no page was obtained it asks for a single order-0 page with
+ * the caller's unmodified gfp mask. nc->va is set to the page's address, or
+ * to NULL when both attempts fail.
+ */
+static struct page *__page_frag_refill(struct page_frag_cache *nc,
+ gfp_t gfp_mask)
+{
+ struct page *page = NULL;
+ gfp_t gfp = gfp_mask;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+ gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
+ __GFP_NOMEMALLOC;
+ page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+ PAGE_FRAG_CACHE_MAX_ORDER);
+ nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+#endif
+ if (unlikely(!page))
+ page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+ nc->va = page ? page_address(page) : NULL;
+
+ return page;
+}
+
+void *__alloc_page_frag(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask)
+{
+ unsigned int size = PAGE_SIZE;
+ struct page *page;
+ int offset;
+
+ if (unlikely(!nc->va)) { /* cache has no backing page yet */
+refill:
+ page = __page_frag_refill(nc, gfp_mask);
+ if (!page)
+ return NULL;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+ /* if size can vary use size else just use PAGE_SIZE */
+ size = nc->size;
+#endif
+ /* Even if we own the page, we do not use atomic_set().
+ * This would break get_page_unless_zero() users.
+ * The size - 1 references added here are, presumably together
+ * with the one the page was allocated with, the size references
+ * that pagecnt_bias accounts for below.
+ */
+ atomic_add(size - 1, &page->_count);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pfmemalloc = page->pfmemalloc;
+ nc->pagecnt_bias = size;
+ nc->offset = size;
+ }
+
+ offset = nc->offset - fragsz; /* fragments are carved from the end of the page downwards */
+ if (unlikely(offset < 0)) { /* fragsz does not fit in what is left */
+ page = virt_to_page(nc->va);
+
+ if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+ goto refill;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+ /* if size can vary use size else just use PAGE_SIZE */
+ size = nc->size;
+#endif
+ /* OK, page count is 0, we can safely set it */
+ atomic_set(&page->_count, size);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = size;
+ offset = size - fragsz;
+ }
+
+ nc->pagecnt_bias--; /* one of the held references now belongs to the returned fragment */
+ nc->offset = offset;
+
+ return nc->va + offset;
+}
+EXPORT_SYMBOL(__alloc_page_frag);
+
+/*
+ * Frees a page fragment allocated out of either a compound or order 0 page.
+ *
+ * @addr is mapped to its head page with virt_to_head_page(). When
+ * put_page_testzero() succeeds on that page, the whole page is released
+ * through __free_pages_ok() at its compound_order().
+ */
+void __free_page_frag(void *addr)
+{
+ struct page *page = virt_to_head_page(addr);
+
+ if (unlikely(put_page_testzero(page)))
+ __free_pages_ok(page, compound_order(page));
+}
+EXPORT_SYMBOL(__free_page_frag);
+
+/*
* alloc_kmem_pages charges newly allocated pages to the kmem resource counter
* of the current memory cgroup.
*
@@ -3201,38 +3663,31 @@ static void show_migration_types(unsigned char type)
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
- * Suppresses nodes that are not allowed by current's cpuset if
- * SHOW_MEM_FILTER_NODES is passed.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ * cpuset.
*/
void show_free_areas(unsigned int filter)
{
+ unsigned long free_pcp = 0;
int cpu;
struct zone *zone;
for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
- show_node(zone);
- printk("%s per-cpu:\n", zone->name);
-
- for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pageset;
-
- pageset = per_cpu_ptr(zone->pageset, cpu);
- printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
- cpu, pageset->pcp.high,
- pageset->pcp.batch, pageset->pcp.count);
- }
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
}
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu"
- " dirty:%lu writeback:%lu unstable:%lu\n"
- " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+ " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
- " free_cma:%lu\n",
+ " free:%lu free_pcp:%lu free_cma:%lu\n",
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_INACTIVE_ANON),
global_page_state(NR_ISOLATED_ANON),
@@ -3243,13 +3698,14 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_FREE_PAGES),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED),
global_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
+ global_page_state(NR_FREE_PAGES),
+ free_pcp,
global_page_state(NR_FREE_CMA_PAGES));
for_each_populated_zone(zone) {
@@ -3257,6 +3713,11 @@ void show_free_areas(unsigned int filter)
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
+
+ free_pcp = 0;
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+
show_node(zone);
printk("%s"
" free:%lukB"
@@ -3283,6 +3744,8 @@ void show_free_areas(unsigned int filter)
" pagetables:%lukB"
" unstable:%lukB"
" bounce:%lukB"
+ " free_pcp:%lukB"
+ " local_pcp:%ukB"
" free_cma:%lukB"
" writeback_tmp:%lukB"
" pages_scanned:%lu"
@@ -3314,6 +3777,8 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
+ K(free_pcp),
+ K(this_cpu_read(zone->pageset->pcp.count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
K(zone_page_state(zone, NR_PAGES_SCANNED)),
@@ -4062,6 +4527,9 @@ static void setup_zone_migrate_reserve(struct zone *zone)
zone->nr_migrate_reserve_block = reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
+ return;
+
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
@@ -4124,15 +4592,16 @@ static void setup_zone_migrate_reserve(struct zone *zone)
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
- struct page *page;
+ pg_data_t *pgdat = NODE_DATA(nid);
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
struct zone *z;
+ unsigned long nr_initialised = 0;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
- z = &NODE_DATA(nid)->node_zones[zone];
+ z = &pgdat->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
@@ -4144,14 +4613,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
+ if (!update_defer_init(pgdat, pfn, end_pfn,
+ &nr_initialised))
+ break;
}
- page = pfn_to_page(pfn);
- set_page_links(page, zone, nid, pfn);
- mminit_verify_page_links(page, zone, nid, pfn);
- init_page_count(page);
- page_mapcount_reset(page);
- page_cpupid_reset_last(page);
- SetPageReserved(page);
+
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -4166,17 +4632,14 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*/
- if ((z->zone_start_pfn <= pfn)
- && (pfn < zone_end_pfn(z))
- && !(pfn & (pageblock_nr_pages - 1)))
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ if (!(pfn & (pageblock_nr_pages - 1))) {
+ struct page *page = pfn_to_page(pfn);
- INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
- /* The shift won't overflow because ZONE_NORMAL is below 4G. */
- if (!is_highmem_idx(zone))
- set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
+ __init_single_page(page, pfn, zone, nid);
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ } else {
+ __init_single_pfn(pfn, zone, nid);
+ }
}
}
@@ -4434,57 +4897,30 @@ int __meminit init_currently_empty_zone(struct zone *zone,
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ *
+ * @pfn: page frame number to look up.
+ * @state: caller-supplied cache holding the range and node id of the last
+ * lookup that did not return -1 (last_start, last_end, last_nid).
+ *
+ * A pfn inside [state->last_start, state->last_end) is answered from @state
+ * without calling memblock_search_pfn_nid(). Otherwise the result of
+ * memblock_search_pfn_nid() is returned as-is, and @state is refreshed only
+ * when that result is not -1.
*/
-int __meminit __early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state)
{
unsigned long start_pfn, end_pfn;
int nid;
- /*
- * NOTE: The following SMP-unsafe globals are only used early in boot
- * when the kernel is running single-threaded.
- */
- static unsigned long __meminitdata last_start_pfn, last_end_pfn;
- static int __meminitdata last_nid;
- if (last_start_pfn <= pfn && pfn < last_end_pfn)
- return last_nid;
+ if (state->last_start <= pfn && pfn < state->last_end)
+ return state->last_nid; /* cache hit: same range as the previous lookup */
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
if (nid != -1) {
- last_start_pfn = start_pfn;
- last_end_pfn = end_pfn;
- last_nid = nid;
+ state->last_start = start_pfn;
+ state->last_end = end_pfn;
+ state->last_nid = nid;
}
return nid;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-int __meminit early_pfn_to_nid(unsigned long pfn)
-{
- int nid;
-
- nid = __early_pfn_to_nid(pfn);
- if (nid >= 0)
- return nid;
- /* just returns 0 */
- return 0;
-}
-
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- int nid;
-
- nid = __early_pfn_to_nid(pfn);
- if (nid >= 0 && nid != node)
- return false;
- return true;
-}
-#endif
-
/**
* free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -4726,22 +5162,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long *zones_size,
unsigned long *zholes_size)
{
- unsigned long realtotalpages, totalpages = 0;
+ unsigned long realtotalpages = 0, totalpages = 0;
enum zone_type i;
- for (i = 0; i < MAX_NR_ZONES; i++)
- totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn,
- zones_size);
- pgdat->node_spanned_pages = totalpages;
-
- realtotalpages = totalpages;
- for (i = 0; i < MAX_NR_ZONES; i++)
- realtotalpages -=
- zone_absent_pages_in_node(pgdat->node_id, i,
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+ unsigned long size, real_size;
+
+ size = zone_spanned_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn,
+ zones_size);
+ real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn, node_end_pfn,
zholes_size);
+ zone->spanned_pages = size;
+ zone->present_pages = real_size;
+
+ totalpages += size;
+ realtotalpages += real_size;
+ }
+
+ pgdat->node_spanned_pages = totalpages;
pgdat->node_present_pages = realtotalpages;
printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
realtotalpages);
@@ -4851,8 +5293,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
* NOTE: pgdat should get zeroed by caller.
*/
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long node_start_pfn, unsigned long node_end_pfn,
- unsigned long *zones_size, unsigned long *zholes_size)
+ unsigned long node_start_pfn, unsigned long node_end_pfn)
{
enum zone_type j;
int nid = pgdat->node_id;
@@ -4873,12 +5314,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
- size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
- node_end_pfn, zones_size);
- realsize = freesize = size - zone_absent_pages_in_node(nid, j,
- node_start_pfn,
- node_end_pfn,
- zholes_size);
+ size = zone->spanned_pages;
+ realsize = freesize = zone->present_pages;
/*
* Adjust freesize so that it accounts for how much memory
@@ -4913,8 +5350,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;
- zone->spanned_pages = size;
- zone->present_pages = realsize;
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
@@ -5003,6 +5438,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+ reset_deferred_meminit(pgdat);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5020,8 +5456,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
- free_area_init_core(pgdat, start_pfn, end_pfn,
- zones_size, zholes_size);
+ free_area_init_core(pgdat, start_pfn, end_pfn);
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5717,7 +6152,7 @@ static void __setup_per_zone_wmarks(void)
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas controls asynch page reclaim, and so should
+ * deltas control asynch page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
@@ -5970,9 +6405,9 @@ out:
return ret;
}
+#ifdef CONFIG_NUMA
int hashdist = HASHDIST_DEFAULT;
-#ifdef CONFIG_NUMA
static int __init set_hashdist(char *str)
{
if (!str)
@@ -6164,7 +6599,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
mask <<= (BITS_PER_LONG - bitidx - 1);
flags <<= (BITS_PER_LONG - bitidx - 1);
- word = ACCESS_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(bitmap[word_bitidx]);
for (;;) {
old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
if (word == old_word)