From 77cd814835df22e177a9caac1b046f7ffdfeedd0 Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Mon, 4 Sep 2023 17:08:49 +0200
Subject: mm/vmstat: use this_cpu_try_cmpxchg in mod_{zone,node}_state

Use this_cpu_try_cmpxchg instead of this_cpu_cmpxchg (*ptr, old, new) == old
in mod_zone_state and mod_node_state.

The x86 CMPXCHG instruction returns success in the ZF flag, so this change
saves a compare after cmpxchg (and the related move instruction in front of
cmpxchg).  Also, try_cmpxchg implicitly assigns the old *ptr value to "old"
when cmpxchg fails, so there is no need to re-read the value in the loop.

No functional change intended.

Link: https://lkml.kernel.org/r/20230904150917.8318-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak
Signed-off-by: Andrew Morton
---
 mm/vmstat.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'mm/vmstat.c')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 00e81e99c6ee..894e4c88d241 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -559,8 +559,10 @@ static inline void mod_zone_state(struct zone *zone,
 {
 	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
 	s8 __percpu *p = pcp->vm_stat_diff + item;
-	long o, n, t, z;
+	long n, t, z;
+	s8 o;
 
+	o = this_cpu_read(*p);
 	do {
 		z = 0;  /* overflow to zone counters */
 
@@ -576,8 +578,7 @@ static inline void mod_zone_state(struct zone *zone,
 		 */
 		t = this_cpu_read(pcp->stat_threshold);
 
-		o = this_cpu_read(*p);
-		n = delta + o;
+		n = delta + (long)o;
 
 		if (abs(n) > t) {
 			int os = overstep_mode * (t >> 1) ;
@@ -586,7 +587,7 @@ static inline void mod_zone_state(struct zone *zone,
 			z = n + os;
 			n = -os;
 		}
-	} while (this_cpu_cmpxchg(*p, o, n) != o);
+	} while (!this_cpu_try_cmpxchg(*p, &o, n));
 
 	if (z)
 		zone_page_state_add(z, zone, item);
@@ -616,7 +617,8 @@ static inline void mod_node_state(struct pglist_data *pgdat,
 {
 	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
 	s8 __percpu *p = pcp->vm_node_stat_diff + item;
-	long o, n, t, z;
+	long n, t, z;
+	s8 o;
 
 	if (vmstat_item_in_bytes(item)) {
 		/*
@@ -629,6 +631,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
 		delta >>= PAGE_SHIFT;
 	}
 
+	o = this_cpu_read(*p);
 	do {
 		z = 0;  /* overflow to node counters */
 
@@ -644,8 +647,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
 		 */
 		t = this_cpu_read(pcp->stat_threshold);
 
-		o = this_cpu_read(*p);
-		n = delta + o;
+		n = delta + (long)o;
 
 		if (abs(n) > t) {
 			int os = overstep_mode * (t >> 1) ;
@@ -654,7 +656,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
 			z = n + os;
 			n = -os;
 		}
-	} while (this_cpu_cmpxchg(*p, o, n) != o);
+	} while (!this_cpu_try_cmpxchg(*p, &o, n));
 
 	if (z)
 		node_page_state_add(z, pgdat, item);
--
cgit v1.2.3
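
To illustrate the try_cmpxchg idiom outside the kernel, the following is a
minimal userspace sketch using the GCC/Clang __atomic builtins rather than
the kernel's this_cpu_*() API; the helper and function names are made up for
illustration.  Like x86 CMPXCHG reporting success in ZF,
__atomic_compare_exchange_n returns a boolean and, on failure, writes the
observed value back into "old", so the retry loop needs neither an explicit
compare nor a re-read.

#include <stdbool.h>

/* cmpxchg()-style helper: returns the value that was observed at *ptr. */
static long cmpxchg_long(long *ptr, long old, long new)
{
	__atomic_compare_exchange_n(ptr, &old, new, false,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	return old;	/* on failure, 'old' now holds the observed value */
}

/* Before: compare the returned value and re-read *ptr on every retry. */
static void add_with_cmpxchg(long *ptr, long delta)
{
	long o, n;

	do {
		o = __atomic_load_n(ptr, __ATOMIC_SEQ_CST);
		n = o + delta;
	} while (cmpxchg_long(ptr, o, n) != o);
}

/* After: the try form reports success as a bool and updates 'o' on failure,
 * so *ptr is read only once, before entering the loop. */
static void add_with_try_cmpxchg(long *ptr, long delta)
{
	long o = __atomic_load_n(ptr, __ATOMIC_SEQ_CST);
	long n;

	do {
		n = o + delta;
	} while (!__atomic_compare_exchange_n(ptr, &o, n, false,
					      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
}

This mirrors the conversion in the hunks above: the load moves out of the
loop and the "!= o" comparison disappears.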
From fa8c4f9a665bd0cf5a475327aea4fd179e896216 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Fri, 11 Aug 2023 17:08:19 +0800
Subject: mm: fix draining remote pageset

If there is no memory allocation/freeing in the PCP (Per-CPU Pageset) of a
remote zone (a zone on a remote NUMA node) for some time (currently 3
seconds), the pages in the PCP of the remote zone will be drained to avoid
memory wastage.

This behavior was introduced by commit 4ae7c03943fc ("[PATCH] Periodically
drain non local pagesets") and commit 4037d452202e ("Move remote node
draining out of slab allocators").

But after commit 7cc36bbddde5 ("vmstat: on-demand vmstat workers V8"), the
vmstat updater worker, which is used to drain the PCP of remote zones, may
not be re-queued while we are waiting for the timeout (pcp->expire != 0) if
there are no vmstat changes on this CPU, for example when the CPU goes idle
or runs user-space-only workloads.  This may cause the pages of a remote
zone to be kept in the PCP of this CPU for a long time, so that page reclaim
in the remote zone may be triggered prematurely.

This isn't a severe problem in practice, because the PCP of the remote zone
will be drained if some memory is allocated/freed again on this CPU, and the
PCP will eventually be drained during direct reclaim if necessary.  Still,
the problem deserves a fix: guarantee that the vmstat updater worker is
always re-queued while we are waiting for the timeout.  In effect, this
restores the original behavior before commit 7cc36bbddde5.

We can reproduce the bug by allocating/freeing pages from a remote zone and
then going idle, as follows; the patch fixes it.

- Run some workloads, using `numactl` to bind the CPU to node 0 and memory
  to node 1, so that the PCP of the CPU on node 0 for the zone on node 1 is
  filled.
- After the workloads finish, idle for 60s.
- Check /proc/zoneinfo.

With the original kernel, the number of pages in the PCP of the CPU on node
0 for the zone on node 1 is non-zero after idling.  With the patched kernel,
it becomes 0.  That is, we avoid keeping pages in the remote PCP while idle.

Link: https://lkml.kernel.org/r/20231007062356.187621-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20230811090819.60845-1-ying.huang@intel.com
Fixes: 7cc36bbddde5 ("vmstat: on-demand vmstat workers V8")
Signed-off-by: "Huang, Ying"
Reviewed-by: Christoph Lameter
Cc: Mel Gorman
Cc: Vlastimil Babka
Cc: Michal Hocko
Signed-off-by: Andrew Morton
---
 mm/vmstat.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/vmstat.c')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 894e4c88d241..88ea95d4221c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -857,8 +857,10 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 				continue;
 			}
 
-			if (__this_cpu_dec_return(pcp->expire))
+			if (__this_cpu_dec_return(pcp->expire)) {
+				changes++;
 				continue;
+			}
 
 			if (__this_cpu_read(pcp->count)) {
 				drain_zone_pages(zone, this_cpu_ptr(pcp));
--
cgit v1.2.3

From 51a755c56dc05a8b31ed28d24f28354946dc7529 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Mon, 16 Oct 2023 13:30:00 +0800
Subject: mm: tune PCP high automatically

The targets of tuning PCP high automatically are as follows:

- Minimize allocation/freeing from/to the shared zone
- Minimize idle pages in the PCP
- Minimize pages in the PCP if the system's free pages are too few

To reach these targets, the following tuning algorithm is designed:

- When we refill the PCP via allocating from the zone, increase PCP high,
  because with a larger PCP we could have avoided allocating from the zone.

- In the periodic vmstat updating kworker (via refresh_cpu_vm_stats()),
  decrease PCP high to try to free possibly idle PCP pages.

- When page reclaim is active for the zone, stop increasing PCP high in the
  allocating path, and decrease PCP high and free some pages in the freeing
  path.

In this way, the PCP high is eventually tuned to the page allocating/freeing
depth of the workload.
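
To make the policy concrete, here is a small self-contained C sketch of the
decision points above.  It is only an illustration under assumed names: the
struct layout, helper names, and bounds are hypothetical, not the actual
mm/page_alloc.c implementation.

/* Illustrative model only; every name and field here is hypothetical. */
struct pcp_sketch {
	int count;	/* pages currently cached on this CPU */
	int high;	/* the auto-tuned limit being adjusted */
	int high_min;	/* static lower bound */
	int high_max;	/* static upper bound */
	int batch;	/* refill/free granularity */
};

/* Allocation path: having to refill from the zone hints that high is too low. */
static void sketch_on_refill_from_zone(struct pcp_sketch *pcp, int reclaim_active)
{
	if (reclaim_active)
		return;			/* don't grow the cache while the zone is being reclaimed */
	pcp->high += pcp->batch;	/* grow toward the workload's allocation depth */
	if (pcp->high > pcp->high_max)
		pcp->high = pcp->high_max;
}

/* Freeing path while reclaim is active: shrink high so freed pages reach the zone sooner. */
static int sketch_on_free_under_reclaim(struct pcp_sketch *pcp)
{
	pcp->high -= pcp->batch;
	if (pcp->high < pcp->high_min)
		pcp->high = pcp->high_min;
	/* pages now above the limit should be released back to the zone */
	return pcp->count > pcp->high ? pcp->count - pcp->high : 0;
}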
One issue with the algorithm is that if the number of pages allocated on a
CPU is much larger than the number of pages freed there, PCP high may grow
to its maximal value even if the allocating/freeing depth is small.  But
this isn't a severe issue, because there are no idle pages in this case.

An alternative choice would be to increase PCP high when we drain the PCP by
trying to free pages to the zone, but not during PCP refilling.  That would
avoid the issue above, but if the number of pages allocated on a CPU is much
smaller than the number of pages freed there, many idle pages accumulate in
the PCP and are hard to free.

PCP high is decreased by 1/8 (>> 3) periodically.  The value 1/8 is somewhat
arbitrary; it just makes sure that idle PCP pages are freed eventually.

On a 2-socket Intel server with 224 logical CPUs, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroups.  This simulates the
kbuild servers used by the 0-Day kbuild service.  With the patch, the build
time decreases by 3.5%.  The cycles% of spinlock contention (mostly for the
zone lock) decreases from 11.0% to 0.5%.  The number of PCP drains for
high-order page freeing (free_high) decreases by 65.6%.  The number of pages
allocated from the zone (instead of from the PCP) decreases by 83.9%.

Link: https://lkml.kernel.org/r/20231016053002.756205-8-ying.huang@intel.com
Signed-off-by: "Huang, Ying"
Suggested-by: Mel Gorman
Suggested-by: Michal Hocko
Cc: Vlastimil Babka
Cc: David Hildenbrand
Cc: Johannes Weiner
Cc: Dave Hansen
Cc: Pavel Tatashin
Cc: Matthew Wilcox
Cc: Christoph Lameter
Cc: Arjan van de Ven
Cc: Sudeep Holla
Signed-off-by: Andrew Morton
---
 mm/vmstat.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm/vmstat.c')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 88ea95d4221c..359460deb377 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -816,9 +816,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 
 	for_each_populated_zone(zone) {
 		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-#ifdef CONFIG_NUMA
 		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
-#endif
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 			int v;
@@ -834,10 +832,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 #endif
 			}
 		}
-#ifdef CONFIG_NUMA
 
 		if (do_pagesets) {
 			cond_resched();
+
+			changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
+#ifdef CONFIG_NUMA
 			/*
 			 * Deal with draining the remote pageset of this
 			 * processor
@@ -866,8 +866,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 				drain_zone_pages(zone, this_cpu_ptr(pcp));
 				changes++;
 			}
-		}
 #endif
+		}
 	}
 
 	for_each_online_pgdat(pgdat) {
--
cgit v1.2.3
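
The periodic decrease is the decay_pcp_high() call added to
refresh_cpu_vm_stats() above.  As a hedged approximation of that step (the
real function operates on struct zone and struct per_cpu_pages and differs
in detail; the parameter names below are made up), the 1/8 decay could be
pictured like this:

/* Approximation only: the real decay_pcp_high() takes (zone, pcp) and
 * returns whether it did any work, which the caller adds to 'changes'. */
static int sketch_decay_pcp_high(int *high, int high_min, int count)
{
	int did_work = 0;

	/* shed 1/8 (>> 3) of the limit each round so idle pages are freed eventually */
	if (*high > high_min) {
		*high -= *high >> 3;
		if (*high < high_min)
			*high = high_min;
		did_work = 1;
	}

	/* pages now above the lowered limit would be drained back to the zone */
	if (count > *high)
		did_work = 1;

	return did_work;
}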