From e5ca8071fe65f409ba074d1c45ec2db977b5b222 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 16 Jul 2019 16:26:09 -0700 Subject: mm/vmscan.c: add a new member reclaim_state in struct scan_control Patch series "mm/vmscan: calculate reclaimed slab in all reclaim paths". This patchset fixes how the reclaimed slab caches are accounted when shrinking slab. There are six different reclaim paths by now, - kswapd reclaim path - node reclaim path - hibernate preallocate memory reclaim path - direct reclaim path - memcg reclaim path - memcg softlimit reclaim path The slab caches reclaimed in these paths are only calculated in the first three of these paths. The issues are explained in detail in patch #2. We should calculate the reclaimed slab caches in every reclaim path. In order to do that, the struct reclaim_state is placed into the struct scan_control. In the node reclaim path, there is another issue about shrinking slab, which is addressed in "mm/vmscan: shrink slab in node reclaim" (https://lore.kernel.org/linux-mm/1559874946-22960-1-git-send-email-laoar.shao@gmail.com/). This patch (of 2): The struct reclaim_state is used to record how many slab caches are reclaimed in one reclaim path. The struct scan_control is used to control one reclaim path. So we'd better put reclaim_state into scan_control. [laoar.shao@gmail.com: remove reclaim_state assignment from __perform_reclaim()] Link: http://lkml.kernel.org/r/1561381582-13697-1-git-send-email-laoar.shao@gmail.com Link: http://lkml.kernel.org/r/1561112086-6169-2-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Andrew Morton Reviewed-by: Kirill Tkhai Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index f8e3dcd527b8..a01897fdfdac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -131,6 +131,9 @@ struct scan_control { unsigned int file_taken; unsigned int taken; } nr; + + /* for recording the reclaimed slab by now */ + struct reclaim_state reclaim_state; }; #ifdef ARCH_HAS_PREFETCH @@ -3483,6 +3486,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_unmap = 1, }; + current->reclaim_state = &sc.reclaim_state; psi_memstall_enter(&pflags); __fs_reclaim_acquire(); @@ -3664,6 +3668,8 @@ out: snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); psi_memstall_leave(&pflags); + current->reclaim_state = NULL; + /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account.
If another caller @@ -3787,15 +3793,10 @@ static int kswapd(void *p) unsigned int classzone_idx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; - - struct reclaim_state reclaim_state = { - .reclaimed_slab = 0, - }; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); - current->reclaim_state = &reclaim_state; /* * Tell the memory management that we're a "memory allocator", @@ -3857,7 +3858,6 @@ kswapd_try_sleep: } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); - current->reclaim_state = NULL; return 0; } @@ -3922,7 +3922,6 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, */ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { - struct reclaim_state reclaim_state; struct scan_control sc = { .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, @@ -3940,8 +3939,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) fs_reclaim_acquire(sc.gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + p->reclaim_state = &sc.reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -4110,7 +4108,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct task_struct *p = current; - struct reclaim_state reclaim_state; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), @@ -4135,8 +4132,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + p->reclaim_state = &sc.reclaim_state; if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* -- cgit v1.2.3 From 0308f7cf19c9741837f5b4c8cde14342bba72604 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 16 Jul 2019 16:26:12 -0700 Subject: mm/vmscan.c: calculate reclaimed slab caches in all reclaim paths There are six different reclaim paths by now: - kswapd reclaim path - node reclaim path - hibernate preallocate memory reclaim path - direct reclaim path - memcg reclaim path - memcg softlimit reclaim path The slab caches reclaimed in these paths are only calculated in the first three of these paths. There are some drawbacks if we don't calculate the reclaimed slab caches: - The sc->nr_reclaimed isn't correct if some slab caches are reclaimed in this path. - The slab caches may be reclaimed almost completely if there are lots of reclaimable slab caches and few page caches. Let's take a simple example of this case. Suppose one memcg is full of slab caches and its limit is 512M; in other words, there are approximately 512M of slab caches in this memcg. When the limit of the memcg is reached, memcg reclaim begins, and in this reclaim path it will continuously reclaim the slab caches until sc->priority drops to 0. After this reclaim stops, few slab caches are left, less than 20M in my test case. With this patch applied, the number left is greater than 300M and sc->priority only drops to 3.
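To make the accounting these two patches establish easier to follow, here is a small, self-contained C sketch of the pattern. This is not kernel code: the *_stub functions, the page counts and the current_task/current definitions are made up for the illustration; only reclaim_state, scan_control, reclaimed_slab and nr_reclaimed correspond to the structures the patches touch. The reclaim_state is embedded in the per-reclaim scan_control, current->reclaim_state points at it while the reclaim runs, and the slab pages freed by the shrinkers are folded into sc->nr_reclaimed.

/*
 * Stand-alone sketch (not kernel code) of the accounting pattern
 * established by the two patches above.
 */
#include <stdio.h>
#include <stddef.h>

struct reclaim_state {
        unsigned long reclaimed_slab;           /* slab pages freed by shrinkers */
};

struct scan_control {
        unsigned long nr_to_reclaim;
        unsigned long nr_reclaimed;
        struct reclaim_state reclaim_state;     /* embedded, as in patch 1 */
};

struct task_struct {
        struct reclaim_state *reclaim_state;
};

static struct task_struct current_task;        /* stand-in for "current" */
#define current (&current_task)

/* A shrinker reports the slab pages it freed via current->reclaim_state. */
static void shrink_slab_stub(void)
{
        if (current->reclaim_state)
                current->reclaim_state->reclaimed_slab += 128;
}

/* LRU scan plus slab shrink; the freed slab is folded into nr_reclaimed. */
static void shrink_node_stub(struct scan_control *sc)
{
        sc->nr_reclaimed += 64;                 /* page cache / anon pages */
        shrink_slab_stub();
        sc->nr_reclaimed += sc->reclaim_state.reclaimed_slab;
        sc->reclaim_state.reclaimed_slab = 0;
}

/* Every reclaim path now installs the embedded state around its reclaim. */
static unsigned long reclaim_path(unsigned long nr_to_reclaim)
{
        struct scan_control sc = { .nr_to_reclaim = nr_to_reclaim };

        current->reclaim_state = &sc.reclaim_state;
        shrink_node_stub(&sc);
        current->reclaim_state = NULL;

        return sc.nr_reclaimed;
}

int main(void)
{
        printf("reclaimed %lu pages (slab included)\n", reclaim_path(256));
        return 0;
}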
Link: http://lkml.kernel.org/r/1561112086-6169-3-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Kirill Tkhai Reviewed-by: Andrew Morton Cc: Kirill Tkhai Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index a01897fdfdac..88d740db3216 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3194,11 +3194,13 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; + current->reclaim_state = &sc.reclaim_state; trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + current->reclaim_state = NULL; return nr_reclaimed; } @@ -3221,6 +3223,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, }; unsigned long lru_pages; + current->reclaim_state = &sc.reclaim_state; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -3238,7 +3241,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + current->reclaim_state = NULL; *nr_scanned = sc.nr_scanned; + return sc.nr_reclaimed; } @@ -3265,6 +3270,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_shrinkslab = 1, }; + current->reclaim_state = &sc.reclaim_state; /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. So the node where we start the @@ -3285,6 +3291,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, psi_memstall_leave(&pflags); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + current->reclaim_state = NULL; return nr_reclaimed; } -- cgit v1.2.3 From 1732d2b0117c26a6bf6027c919e49603156ea93d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 16 Jul 2019 16:26:15 -0700 Subject: mm/vmscan.c: add checks for incorrect handling of current->reclaim_state Six sites are presently altering current->reclaim_state. There is a risk that one function stomps on a caller's value. Use a helper function to catch such errors. 
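The misuse the helper is meant to catch can also be shown with a small, self-contained sketch. Again this is not kernel code: warn_on() is a plain stderr stand-in for WARN_ON_ONCE() (it fires every time rather than once), and task_struct and reclaim_state are reduced to the single field the example needs; the real helper added by this patch is in the diff below.

/*
 * Stand-alone illustration (not kernel code) of the two misuses the
 * helper warns about.
 */
#include <stdio.h>
#include <stddef.h>

struct reclaim_state { unsigned long reclaimed_slab; };
struct task_struct { struct reclaim_state *reclaim_state; };

static struct task_struct current_task;        /* stand-in for "current" */
#define current (&current_task)

/* Userspace stand-in for WARN_ON_ONCE(); fires on every occurrence. */
#define warn_on(cond) \
        do { if (cond) fprintf(stderr, "WARNING: %s\n", #cond); } while (0)

static void set_task_reclaim_state(struct task_struct *task,
                                   struct reclaim_state *rs)
{
        /* Setting while already set: a callee would stomp its caller's state. */
        warn_on(rs && task->reclaim_state);

        /* Clearing while already clear: an unbalanced set/clear pair. */
        warn_on(!rs && !task->reclaim_state);

        task->reclaim_state = rs;
}

int main(void)
{
        struct reclaim_state outer = { 0 }, inner = { 0 };

        /* Correct, balanced pairing: no warning. */
        set_task_reclaim_state(current, &outer);
        set_task_reclaim_state(current, NULL);

        /* The caller's state is still installed when a callee installs its
         * own: the first check fires and the overwrite becomes visible. */
        set_task_reclaim_state(current, &outer);
        set_task_reclaim_state(current, &inner);        /* warns: overwrite */
        set_task_reclaim_state(current, NULL);
        set_task_reclaim_state(current, NULL);          /* warns: already NULL */

        return 0;
}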
Cc: Yafang Shao Cc: Kirill Tkhai Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 88d740db3216..44df66a98f2a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -241,6 +241,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) } #endif /* CONFIG_MEMCG_KMEM */ +static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) +{ + /* Check for an overwrite */ + WARN_ON_ONCE(rs && task->reclaim_state); + + /* Check for the nulling of an already-nulled member */ + WARN_ON_ONCE(!rs && !task->reclaim_state); + + task->reclaim_state = rs; +} + #ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { @@ -3194,13 +3206,13 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; - current->reclaim_state = &sc.reclaim_state; + set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); - current->reclaim_state = NULL; + set_task_reclaim_state(current, NULL); return nr_reclaimed; } @@ -3223,7 +3235,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, }; unsigned long lru_pages; - current->reclaim_state = &sc.reclaim_state; + set_task_reclaim_state(current, &sc.reclaim_state); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -3241,7 +3253,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); - current->reclaim_state = NULL; + set_task_reclaim_state(current, NULL); *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; @@ -3270,7 +3282,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_shrinkslab = 1, }; - current->reclaim_state = &sc.reclaim_state; + set_task_reclaim_state(current, &sc.reclaim_state); /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. 
So the node where we start the @@ -3291,7 +3303,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, psi_memstall_leave(&pflags); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); - current->reclaim_state = NULL; + set_task_reclaim_state(current, NULL); return nr_reclaimed; } @@ -3493,7 +3505,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_unmap = 1, }; - current->reclaim_state = &sc.reclaim_state; + set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); __fs_reclaim_acquire(); @@ -3675,7 +3687,7 @@ out: snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); psi_memstall_leave(&pflags); - current->reclaim_state = NULL; + set_task_reclaim_state(current, NULL); /* * Return the order kswapd stopped reclaiming at as @@ -3940,17 +3952,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .hibernation_mode = 1, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); - struct task_struct *p = current; unsigned long nr_reclaimed; unsigned int noreclaim_flag; fs_reclaim_acquire(sc.gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); - p->reclaim_state = &sc.reclaim_state; + set_task_reclaim_state(current, &sc.reclaim_state); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - p->reclaim_state = NULL; + set_task_reclaim_state(current, NULL); memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); @@ -4139,7 +4150,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - p->reclaim_state = &sc.reclaim_state; + set_task_reclaim_state(p, &sc.reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* @@ -4151,7 +4162,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } - p->reclaim_state = NULL; + set_task_reclaim_state(p, NULL); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); -- cgit v1.2.3