diff options
-rw-r--r-- | include/linux/mmzone.h | 6 | ||||
-rw-r--r-- | mm/page_alloc.c | 101 |
2 files changed, 59 insertions, 48 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 11671d14409e..ac819bf9522c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -529,6 +529,7 @@ typedef enum { ZONE_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ + ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -566,6 +567,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone) return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); } +static inline int zone_is_fair_depleted(const struct zone *zone) +{ + return test_bit(ZONE_FAIR_DEPLETED, &zone->flags); +} + static inline int zone_is_oom_locked(const struct zone *zone) { return test_bit(ZONE_OOM_LOCKED, &zone->flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3eb5adf1b2dd..ea419137f845 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1614,6 +1614,9 @@ again: } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && + !zone_is_fair_depleted(zone)) + zone_set_flag(zone, ZONE_FAIR_DEPLETED); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1938,6 +1941,18 @@ static inline void init_zone_allows_reclaim(int nid) } #endif /* CONFIG_NUMA */ +static void reset_alloc_batches(struct zone *preferred_zone) +{ + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + } while (zone++ != preferred_zone); +} + /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -1955,8 +1970,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int did_zlc_setup = 0; /* just call zlc_setup() one time */ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. @@ -1981,8 +2000,10 @@ zonelist_scan: if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) break; - if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0) + if (zone_is_fair_depleted(zone)) { + nr_fair_skipped++; continue; + } } /* * When allocating a page cache page for writing, we @@ -2088,13 +2109,7 @@ this_zone_full: zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2103,8 +2118,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2433,28 +2477,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void reset_alloc_batches(struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - /* - * Only reset the batches of zones that were actually - * considered in the fairness pass, we don't want to - * trash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - } -} - static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2792,29 +2814,12 @@ retry_cpuset: if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif -retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* - * The first pass makes sure allocations are spread - * fairly within the local node. However, the local - * node might have free pages left after the fairness - * batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without - * fairness, and include remote zones now, before - * entering the slowpath and waking kswapd: prefer - * spilling to a remote zone over swapping locally. - */ - if (alloc_flags & ALLOC_FAIR) { - reset_alloc_batches(zonelist, high_zoneidx, - preferred_zone); - alloc_flags &= ~ALLOC_FAIR; - goto retry; - } - /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. |