Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 212 |
1 file changed, 149 insertions, 63 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4f49535d4cd3..7ef69124fa3e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,8 +95,6 @@ struct scan_control {
         /* Can pages be swapped as part of reclaim? */
         int may_swap;
 
-        int swappiness;
-
         int order;
 
         /*
@@ -107,6 +105,7 @@ struct scan_control {
 
         /* Which cgroup do we reclaim from */
         struct mem_cgroup *mem_cgroup;
+        struct memcg_scanrecord *memcg_record;
 
         /*
          * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -173,7 +172,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
                                 struct scan_control *sc, enum lru_list lru)
 {
         if (!scanning_global_lru(sc))
-                return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
+                return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
+                                zone_to_nid(zone), zone_idx(zone), BIT(lru));
 
         return zone_page_state(zone, NR_LRU_BASE + lru);
 }
@@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
                 unsigned long long delta;
                 unsigned long total_scan;
                 unsigned long max_pass;
+                int shrink_ret = 0;
+                long nr;
+                long new_nr;
+                long batch_size = shrinker->batch ? shrinker->batch
+                                                  : SHRINK_BATCH;
+
+                /*
+                 * copy the current shrinker scan count into a local variable
+                 * and zero it so that other concurrent shrinker invocations
+                 * don't also do this scanning work.
+                 */
+                do {
+                        nr = shrinker->nr;
+                } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
 
+                total_scan = nr;
                 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
                 delta = (4 * nr_pages_scanned) / shrinker->seeks;
                 delta *= max_pass;
                 do_div(delta, lru_pages + 1);
-                shrinker->nr += delta;
-                if (shrinker->nr < 0) {
+                total_scan += delta;
+                if (total_scan < 0) {
                         printk(KERN_ERR "shrink_slab: %pF negative objects to "
                                "delete nr=%ld\n",
-                               shrinker->shrink, shrinker->nr);
-                        shrinker->nr = max_pass;
+                               shrinker->shrink, total_scan);
+                        total_scan = max_pass;
                 }
 
                 /*
+                 * We need to avoid excessive windup on filesystem shrinkers
+                 * due to large numbers of GFP_NOFS allocations causing the
+                 * shrinkers to return -1 all the time. This results in a large
+                 * nr being built up so when a shrink that can do some work
+                 * comes along it empties the entire cache due to nr >>>
+                 * max_pass. This is bad for sustaining a working set in
+                 * memory.
+                 *
+                 * Hence only allow the shrinker to scan the entire cache when
+                 * a large delta change is calculated directly.
+                 */
+                if (delta < max_pass / 4)
+                        total_scan = min(total_scan, max_pass / 2);
+
+                /*
                  * Avoid risking looping forever due to too large nr value:
                  * never try to free more than twice the estimate number of
                  * freeable entries.
                  */
-                if (shrinker->nr > max_pass * 2)
-                        shrinker->nr = max_pass * 2;
+                if (total_scan > max_pass * 2)
+                        total_scan = max_pass * 2;
 
-                total_scan = shrinker->nr;
-                shrinker->nr = 0;
+                trace_mm_shrink_slab_start(shrinker, shrink, nr,
+                                        nr_pages_scanned, lru_pages,
+                                        max_pass, delta, total_scan);
 
-                while (total_scan >= SHRINK_BATCH) {
-                        long this_scan = SHRINK_BATCH;
-                        int shrink_ret;
+                while (total_scan >= batch_size) {
                         int nr_before;
 
                         nr_before = do_shrinker_shrink(shrinker, shrink, 0);
                         shrink_ret = do_shrinker_shrink(shrinker, shrink,
-                                                        this_scan);
+                                                        batch_size);
                         if (shrink_ret == -1)
                                 break;
                         if (shrink_ret < nr_before)
                                 ret += nr_before - shrink_ret;
-                        count_vm_events(SLABS_SCANNED, this_scan);
-                        total_scan -= this_scan;
+                        count_vm_events(SLABS_SCANNED, batch_size);
+                        total_scan -= batch_size;
 
                         cond_resched();
                 }
 
-                shrinker->nr += total_scan;
+                /*
+                 * move the unused scan count back into the shrinker in a
+                 * manner that handles concurrent updates. If we exhausted the
+                 * scan, there is no need to do an update.
+                 */
+                do {
+                        nr = shrinker->nr;
+                        new_nr = total_scan + nr;
+                        if (total_scan <= 0)
+                                break;
+                } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+                trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
         }
         up_read(&shrinker_rwsem);
 out:
@@ -1308,6 +1349,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
                         int file = is_file_lru(lru);
                         int numpages = hpage_nr_pages(page);
                         reclaim_stat->recent_rotated[file] += numpages;
+                        if (!scanning_global_lru(sc))
+                                sc->memcg_record->nr_rotated[file] += numpages;
                 }
                 if (!pagevec_add(&pvec, page)) {
                         spin_unlock_irq(&zone->lru_lock);
@@ -1351,6 +1394,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
 
         reclaim_stat->recent_scanned[0] += *nr_anon;
         reclaim_stat->recent_scanned[1] += *nr_file;
+        if (!scanning_global_lru(sc)) {
+                sc->memcg_record->nr_scanned[0] += *nr_anon;
+                sc->memcg_record->nr_scanned[1] += *nr_file;
+        }
 }
 
 /*
@@ -1464,6 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
         }
 
+        if (!scanning_global_lru(sc))
+                sc->memcg_record->nr_freed[file] += nr_reclaimed;
+
         local_irq_disable();
         if (current_is_kswapd())
                 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1563,6 +1613,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         }
 
         reclaim_stat->recent_scanned[file] += nr_taken;
+        if (!scanning_global_lru(sc))
+                sc->memcg_record->nr_scanned[file] += nr_taken;
 
         __count_zone_vm_events(PGREFILL, zone, pgscanned);
         if (file)
@@ -1614,6 +1666,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
          * get_scan_ratio.
          */
         reclaim_stat->recent_rotated[file] += nr_rotated;
+        if (!scanning_global_lru(sc))
+                sc->memcg_record->nr_rotated[file] += nr_rotated;
 
         move_active_pages_to_lru(zone, &l_active,
                                                 LRU_ACTIVE + file * LRU_FILE);
@@ -1729,6 +1783,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
         return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
 }
 
+static int vmscan_swappiness(struct scan_control *sc)
+{
+        if (scanning_global_lru(sc))
+                return vm_swappiness;
+        return mem_cgroup_swappiness(sc->mem_cgroup);
+}
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned. The relative value of each set of LRU lists is determined
@@ -1748,6 +1809,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         enum lru_list l;
         int noswap = 0;
         int force_scan = 0;
+        unsigned long nr_force_scan[2];
 
         anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
                 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
@@ -1770,6 +1832,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                 fraction[0] = 0;
                 fraction[1] = 1;
                 denominator = 1;
+                nr_force_scan[0] = 0;
+                nr_force_scan[1] = SWAP_CLUSTER_MAX;
                 goto out;
         }
 
@@ -1781,6 +1845,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                         fraction[0] = 1;
                         fraction[1] = 0;
                         denominator = 1;
+                        nr_force_scan[0] = SWAP_CLUSTER_MAX;
+                        nr_force_scan[1] = 0;
                         goto out;
                 }
         }
@@ -1789,8 +1855,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
          * With swappiness at 100, anonymous and file have the same priority.
          * This scanning priority is essentially the inverse of IO cost.
          */
-        anon_prio = sc->swappiness;
-        file_prio = 200 - sc->swappiness;
+        anon_prio = vmscan_swappiness(sc);
+        file_prio = 200 - vmscan_swappiness(sc);
 
         /*
          * OK, so we have swap space and a fair amount of page cache
@@ -1829,6 +1895,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         fraction[0] = ap;
         fraction[1] = fp;
         denominator = ap + fp + 1;
+        if (force_scan) {
+                unsigned long scan = SWAP_CLUSTER_MAX;
+                nr_force_scan[0] = div64_u64(scan * ap, denominator);
+                nr_force_scan[1] = div64_u64(scan * fp, denominator);
+        }
 out:
         for_each_evictable_lru(l) {
                 int file = is_file_lru(l);
@@ -1849,12 +1920,8 @@ out:
                  * memcg, priority drop can cause big latency. So, it's better
                  * to scan small amount. See may_noscan above.
                  */
-                if (!scan && force_scan) {
-                        if (file)
-                                scan = SWAP_CLUSTER_MAX;
-                        else if (!noswap)
-                                scan = SWAP_CLUSTER_MAX;
-                }
+                if (!scan && force_scan)
+                        scan = nr_force_scan[file];
                 nr[l] = scan;
         }
 }
@@ -2179,7 +2246,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
                 .may_unmap = 1,
                 .may_swap = 1,
-                .swappiness = vm_swappiness,
                 .order = order,
                 .mem_cgroup = NULL,
                 .nodemask = nodemask,
@@ -2202,10 +2268,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
-                                                gfp_t gfp_mask, bool noswap,
-                                                unsigned int swappiness,
-                                                struct zone *zone,
-                                                unsigned long *nr_scanned)
+                                        gfp_t gfp_mask, bool noswap,
+                                        struct zone *zone,
+                                        struct memcg_scanrecord *rec,
+                                        unsigned long *scanned)
 {
         struct scan_control sc = {
                 .nr_scanned = 0,
@@ -2213,10 +2279,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                 .may_writepage = !laptop_mode,
                 .may_unmap = 1,
                 .may_swap = !noswap,
-                .swappiness = swappiness,
                 .order = 0,
                 .mem_cgroup = mem,
+                .memcg_record = rec,
         };
+        unsigned long start, end;
 
         sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                         (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2225,6 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                                                 sc.may_writepage,
                                                 sc.gfp_mask);
 
+        start = sched_clock();
         /*
          * NOTE: Although we can get the priority field, using it
          * here is not a good idea, since it limits the pages we can scan.
@@ -2233,29 +2301,34 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
          * the priority and make it zero.
          */
         shrink_zone(0, zone, &sc);
+        end = sched_clock();
+
+        if (rec)
+                rec->elapsed += end - start;
+        *scanned = sc.nr_scanned;
 
         trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
-        *nr_scanned = sc.nr_scanned;
         return sc.nr_reclaimed;
 }
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                                            gfp_t gfp_mask,
                                            bool noswap,
-                                           unsigned int swappiness)
+                                           struct memcg_scanrecord *rec)
 {
         struct zonelist *zonelist;
         unsigned long nr_reclaimed;
+        unsigned long start, end;
         int nid;
         struct scan_control sc = {
                 .may_writepage = !laptop_mode,
                 .may_unmap = 1,
                 .may_swap = !noswap,
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
-                .swappiness = swappiness,
                 .order = 0,
                 .mem_cgroup = mem_cont,
+                .memcg_record = rec,
                 .nodemask = NULL, /* we don't care the placement */
                 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2264,6 +2337,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                 .gfp_mask = sc.gfp_mask,
         };
 
+        start = sched_clock();
         /*
          * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
          * take care of from where we get pages. So the node where we start the
@@ -2278,6 +2352,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                                             sc.gfp_mask);
 
         nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+        end = sched_clock();
+        if (rec)
+                rec->elapsed += end - start;
 
         trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
@@ -2310,7 +2387,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
         for (i = 0; i <= classzone_idx; i++)
                 present_pages += pgdat->node_zones[i].present_pages;
 
-        return balanced_pages > (present_pages >> 2);
+        /* A special case here: if zone has no page, we think it's balanced */
+        return balanced_pages >= (present_pages >> 2);
 }
 
 /* is kswapd sleeping prematurely? */
@@ -2326,7 +2404,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
                 return true;
 
         /* Check the watermark levels */
-        for (i = 0; i < pgdat->nr_zones; i++) {
+        for (i = 0; i <= classzone_idx; i++) {
                 struct zone *zone = pgdat->node_zones + i;
 
                 if (!populated_zone(zone))
@@ -2344,7 +2422,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
                 }
 
                 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
-                                                        classzone_idx, 0))
+                                                        i, 0))
                         all_zones_ok = false;
                 else
                         balanced += zone->present_pages;
@@ -2403,7 +2481,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                  * we want to put equal scanning pressure on each zone.
                  */
                 .nr_to_reclaim = ULONG_MAX,
-                .swappiness = vm_swappiness,
                 .order = order,
                 .mem_cgroup = NULL,
         };
@@ -2451,7 +2528,6 @@ loop_again:
                         if (!zone_watermark_ok_safe(zone, order,
                                         high_wmark_pages(zone), 0, 0)) {
                                 end_zone = i;
-                                *classzone_idx = i;
                                 break;
                         }
                 }
@@ -2510,18 +2586,18 @@ loop_again:
                                         KSWAPD_ZONE_BALANCE_GAP_RATIO);
                         if (!zone_watermark_ok_safe(zone, order,
                                         high_wmark_pages(zone) + balance_gap,
-                                        end_zone, 0))
+                                        end_zone, 0)) {
                                 shrink_zone(priority, zone, &sc);
-                        reclaim_state->reclaimed_slab = 0;
-                        nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
-                        sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-                        total_scanned += sc.nr_scanned;
-
-                        if (zone->all_unreclaimable)
-                                continue;
-                        if (nr_slab == 0 &&
-                            !zone_reclaimable(zone))
-                                zone->all_unreclaimable = 1;
+                                reclaim_state->reclaimed_slab = 0;
+                                nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+                                sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+                                total_scanned += sc.nr_scanned;
+
+                                if (nr_slab == 0 && !zone_reclaimable(zone))
+                                        zone->all_unreclaimable = 1;
+                        }
+
                         /*
                          * If we've done a decent amount of scanning and
                          * the reclaim ratio is low, start doing writepage
@@ -2531,6 +2607,12 @@ loop_again:
                             total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                 sc.may_writepage = 1;
 
+                        if (zone->all_unreclaimable) {
+                                if (end_zone && end_zone == i)
+                                        end_zone--;
+                                continue;
+                        }
+
                         if (!zone_watermark_ok_safe(zone, order,
                                         high_wmark_pages(zone), end_zone, 0)) {
                                 all_zones_ok = 0;
@@ -2709,8 +2791,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  */
 static int kswapd(void *p)
 {
-        unsigned long order;
-        int classzone_idx;
+        unsigned long order, new_order;
+        int classzone_idx, new_classzone_idx;
         pg_data_t *pgdat = (pg_data_t*)p;
         struct task_struct *tsk = current;
 
@@ -2740,17 +2822,23 @@ static int kswapd(void *p)
         tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
         set_freezable();
 
-        order = 0;
-        classzone_idx = MAX_NR_ZONES - 1;
+        order = new_order = 0;
+        classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
         for ( ; ; ) {
-                unsigned long new_order;
-                int new_classzone_idx;
                 int ret;
 
-                new_order = pgdat->kswapd_max_order;
-                new_classzone_idx = pgdat->classzone_idx;
-                pgdat->kswapd_max_order = 0;
-                pgdat->classzone_idx = MAX_NR_ZONES - 1;
+                /*
+                 * If the last balance_pgdat was unsuccessful it's unlikely a
+                 * new request of a similar or harder type will succeed soon
+                 * so consider going to sleep on the basis we reclaimed at
+                 */
+                if (classzone_idx >= new_classzone_idx && order == new_order) {
+                        new_order = pgdat->kswapd_max_order;
+                        new_classzone_idx = pgdat->classzone_idx;
+                        pgdat->kswapd_max_order = 0;
+                        pgdat->classzone_idx = pgdat->nr_zones - 1;
+                }
+
                 if (order < new_order || classzone_idx > new_classzone_idx) {
                         /*
                          * Don't sleep if someone wants a larger 'order'
@@ -2763,7 +2851,7 @@ static int kswapd(void *p)
                         order = pgdat->kswapd_max_order;
                         classzone_idx = pgdat->classzone_idx;
                         pgdat->kswapd_max_order = 0;
-                        pgdat->classzone_idx = MAX_NR_ZONES - 1;
+                        pgdat->classzone_idx = pgdat->nr_zones - 1;
                 }
 
                 ret = try_to_freeze();
@@ -2862,7 +2950,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
                 .may_writepage = 1,
                 .nr_to_reclaim = nr_to_reclaim,
                 .hibernation_mode = 1,
-                .swappiness = vm_swappiness,
                 .order = 0,
         };
         struct shrink_control shrink = {
@@ -3049,7 +3136,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                 .nr_to_reclaim = max_t(unsigned long, nr_pages,
                                        SWAP_CLUSTER_MAX),
                 .gfp_mask = gfp_mask,
-                .swappiness = vm_swappiness,
                 .order = order,
         };
         struct shrink_control shrink = {
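The most delicate part of the change above is how the reworked shrink_slab() handles shrinker->nr without a lock: the deferred scan count is claimed in full with cmpxchg(), reclaim then works on the local total_scan, and whatever is left over is merged back with a second cmpxchg() loop (skipped when the scan was exhausted), so counts added by concurrent callers in the meantime are not lost. The following self-contained C sketch shows the same take/return pattern using C11 atomics in place of the kernel's cmpxchg(); the names pending_scan, take_all() and put_back() are made up for this illustration and are not kernel interfaces.

/*
 * Standalone illustration (not kernel code) of the cmpxchg-based
 * take/return pattern the reworked shrink_slab() applies to shrinker->nr:
 * claim the whole deferred count atomically, work on a private copy, then
 * merge any unused remainder back without losing concurrent updates.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic long pending_scan;       /* stands in for shrinker->nr */

/* Atomically claim the entire pending count, leaving zero behind. */
static long take_all(void)
{
        long nr = atomic_load(&pending_scan);

        /* retry until we swap out exactly the value we read */
        while (!atomic_compare_exchange_weak(&pending_scan, &nr, 0))
                ;
        return nr;
}

/* Merge the unused remainder back, tolerating concurrent additions. */
static void put_back(long unused)
{
        long nr, new_nr;

        if (unused <= 0)
                return;         /* scan was exhausted, nothing to return */

        nr = atomic_load(&pending_scan);
        do {
                new_nr = nr + unused;
        } while (!atomic_compare_exchange_weak(&pending_scan, &nr, new_nr));
}

int main(void)
{
        long total_scan;

        atomic_store(&pending_scan, 128);       /* deferred work built up earlier */
        total_scan = take_all();                /* claim all of it: 128 */
        total_scan -= 96;                       /* pretend we scanned 96 objects */
        put_back(total_scan);                   /* return the remaining 32 */
        printf("left over: %ld\n", atomic_load(&pending_scan));
        return 0;
}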