From 4e0cf05f60590c4119c2eeacf136d1b832978370 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 Dec 2022 18:13:39 +0100 Subject: mm: memcontrol: skip moving non-present pages that are mapped elsewhere Patch series "mm: push down lock_page_memcg()", v2. This patch (of 3): During charge moving, the pte lock and the page lock cover nearly all cases of stabilizing page_mapped(). The only exception is when we're looking at a non-present pte and find a page in the page cache or in the swapcache: if the page is mapped elsewhere, it can become unmapped outside of our control. For this reason, rmap needs lock_page_memcg(). We don't like cgroup-specific locks in generic MM code - especially in performance-critical MM code - and for a legacy feature that's unlikely to have many users left - if any. So remove the exception. Arguably that's better semantics anyway: the page is shared, and another process seems to be the more active user. Once we stop moving such pages, rmap doesn't need lock_page_memcg() anymore. The next patch will remove it. Link: https://lkml.kernel.org/r/20221206171340.139790-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20221206171340.139790-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab457f0394ab..a698a2b6523b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5692,7 +5692,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * - * The caller must make sure the page is not on LRU (isolate_page() is useful.) + * The page must be locked and not on the LRU. * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. @@ -5709,20 +5709,13 @@ static int mem_cgroup_move_account(struct page *page, int nid, ret; VM_BUG_ON(from == to); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON(compound && !folio_test_large(folio)); - /* - * Prevent mem_cgroup_migrate() from looking at - * page's memory cgroup of its source page while we change it. - */ - ret = -EBUSY; - if (!folio_trylock(folio)) - goto out; - ret = -EINVAL; if (folio_memcg(folio) != from) - goto out_unlock; + goto out; pgdat = folio_pgdat(folio); from_vec = mem_cgroup_lruvec(from, pgdat); @@ -5809,8 +5802,6 @@ static int mem_cgroup_move_account(struct page *page, mem_cgroup_charge_statistics(from, -nr_pages); memcg_check_events(from, nid); local_irq_enable(); -out_unlock: - folio_unlock(folio); out: return ret; } @@ -5859,6 +5850,29 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); + if (target && page) { + if (!trylock_page(page)) { + put_page(page); + return ret; + } + /* + * page_mapped() must be stable during the move. This + * pte is locked, so if it's present, the page cannot + * become unmapped. If it isn't, we have only partial + * control over the mapped state: the page lock will + * prevent new faults against pagecache and swapcache, + * so an unmapped page cannot become mapped. However, + * if the page is already mapped elsewhere, it can + * unmap, and there is nothing we can do about it. + * Alas, skip moving the page in this case. + */ + if (!pte_present(ptent) && page_mapped(page)) { + unlock_page(page); + put_page(page); + return ret; + } + } + if (!page && !ent.val) return ret; if (page) { @@ -5875,8 +5889,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (target) target->page = page; } - if (!ret || !target) + if (!ret || !target) { + if (target) + unlock_page(page); put_page(page); + } } /* * There is a swap entry and a page doesn't exist or isn't charged. @@ -5916,6 +5933,10 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, ret = MC_TARGET_PAGE; if (target) { get_page(page); + if (!trylock_page(page)) { + put_page(page); + return MC_TARGET_NONE; + } target->page = page; } } @@ -6154,6 +6175,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } putback_lru_page(page); } + unlock_page(page); put_page(page); } else if (target_type == MC_TARGET_DEVICE) { page = target.page; @@ -6162,6 +6184,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } + unlock_page(page); put_page(page); } spin_unlock(ptl); @@ -6204,7 +6227,8 @@ retry: } if (!device) putback_lru_page(page); -put: /* get_mctgt_type() gets the page */ +put: /* get_mctgt_type() gets & locks the page */ + unlock_page(page); put_page(page); break; case MC_TARGET_SWAP: -- cgit v1.2.3 From da34a8484d162585e22ed8c1e4114aa2f60e3567 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 7 Dec 2022 14:00:39 +0100 Subject: mm: memcontrol: deprecate charge moving Charge moving mode in cgroup1 allows memory to follow tasks as they migrate between cgroups. This is, and always has been, a questionable thing to do - for several reasons. First, it's expensive. Pages need to be identified, locked and isolated from various MM operations, and reassigned, one by one. Second, it's unreliable. Once pages are charged to a cgroup, there isn't always a clear owner task anymore. Cache isn't moved at all, for example. Mapped memory is moved - but if trylocking or isolating a page fails, it's arbitrarily left behind. Frequent moving between domains may leave a task's memory scattered all over the place. Third, it isn't really needed. Launcher tasks can kick off workload tasks directly in their target cgroup. Using dedicated per-workload groups allows fine-grained policy adjustments - no need to move tasks and their physical pages between control domains. The feature was never forward-ported to cgroup2, and it hasn't been missed. Despite it being a niche usecase, the maintenance overhead of supporting it is enormous. Because pages are moved while they are live and subject to various MM operations, the synchronization rules are complicated. There are lock_page_memcg() in MM and FS code, which non-cgroup people don't understand. In some cases we've been able to shift code and cgroup API calls around such that we can rely on native locking as much as possible. But that's fragile, and sometimes we need to hold MM locks for longer than we otherwise would (pte lock e.g.). Mark the feature deprecated. Hopefully we can remove it soon. And backport into -stable kernels so that people who develop against earlier kernels are warned about this deprecation as early as possible. [akpm@linux-foundation.org: fix memory.rst underlining] Link: https://lkml.kernel.org/r/Y5COd+qXwk/S+n8N@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Hugh Dickins Acked-by: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a698a2b6523b..49f67176a1a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3919,6 +3919,10 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); + if (val & ~MOVE_MASK) return -EINVAL; -- cgit v1.2.3 From e4dde56cd208674ce899b47589f263499e5b8cdc Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:04 -0700 Subject: mm: multi-gen LRU: per-node lru_gen_folio lists For each node, memcgs are divided into two generations: the old and the young. For each generation, memcgs are randomly sharded into multiple bins to improve scalability. For each bin, an RCU hlist_nulls is virtually divided into three segments: the head, the tail and the default. An onlining memcg is added to the tail of a random bin in the old generation. The eviction starts at the head of a random bin in the old generation. The per-node memcg generation counter, whose reminder (mod 2) indexes the old generation, is incremented when all its bins become empty. There are four operations: 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its current generation (old or young) and updates its "seg" to "head"; 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its current generation (old or young) and updates its "seg" to "tail"; 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old generation, updates its "gen" to "old" and resets its "seg" to "default"; 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the young generation, updates its "gen" to "young" and resets its "seg" to "default". The events that trigger the above operations are: 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; 2. The first attempt to reclaim an memcg below low, which triggers MEMCG_LRU_TAIL; 3. The first attempt to reclaim an memcg below reclaimable size threshold, which triggers MEMCG_LRU_TAIL; 4. The second attempt to reclaim an memcg below reclaimable size threshold, which triggers MEMCG_LRU_YOUNG; 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. Note that memcg LRU only applies to global reclaim, and the round-robin incrementing of their max_seq counters ensures the eventual fairness to all eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memcontrol.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 49f67176a1a2..2758b67eb169 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -478,6 +478,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) struct mem_cgroup_per_node *mz; struct mem_cgroup_tree_per_node *mctz; + if (lru_gen_enabled()) { + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; + + /* see the comment on MEMCG_NR_GENS */ + if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); + + return; + } + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (!mctz) return; @@ -3530,6 +3540,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, struct mem_cgroup_tree_per_node *mctz; unsigned long excess; + if (lru_gen_enabled()) + return 0; + if (order > 0) return 0; @@ -5391,6 +5404,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (unlikely(mem_cgroup_is_root(memcg))) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + lru_gen_online_memcg(memcg); return 0; offline_kmem: memcg_offline_kmem(memcg); @@ -5422,6 +5436,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); + lru_gen_offline_memcg(memcg); drain_all_stock(memcg); @@ -5433,6 +5448,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); invalidate_reclaim_iterators(memcg); + lru_gen_release_memcg(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) -- cgit v1.2.3 From becacb04fdd439d7d1f2a93739161706a2e3e947 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 30 Dec 2022 15:08:42 +0800 Subject: mm: memcg: add folio_memcg_check() Patch series "mm: convert page_idle/damon to use folios", v4. This patch (of 8): Convert page_memcg_check() into folio_memcg_check() and add a page_memcg_check() wrapper. The behaviour of page_memcg_check() is unchanged; tail pages always had a NULL ->memcg_data. Link: https://lkml.kernel.org/r/20221230070849.63358-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20221230070849.63358-2-wangkefeng.wang@huawei.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: SeongJae Park Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2758b67eb169..17965e558ab7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2952,13 +2952,13 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) } /* - * page_memcg_check() is used here, because in theory we can encounter + * folio_memcg_check() is used here, because in theory we can encounter * a folio where the slab flag has been cleared already, but * slab->memcg_data has not been freed yet - * page_memcg_check(page) will guarantee that a proper memory + * folio_memcg_check() will guarantee that a proper memory * cgroup pointer or NULL will be returned. */ - return page_memcg_check(folio_page(folio, 0)); + return folio_memcg_check(folio); } /* -- cgit v1.2.3 From 75376c6fb93b99e94192cfff48222d11819ee917 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:25:07 +0000 Subject: mm: convert mem_cgroup_css_from_page() to mem_cgroup_css_from_folio() Only one caller doesn't have a folio, so move the page_folio() call to that one caller from mem_cgroup_css_from_folio(). Link: https://lkml.kernel.org/r/20230116192507.2146150-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index faeea84964aa..893427aded01 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -350,21 +350,19 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /** - * mem_cgroup_css_from_page - css of the memcg associated with a page - * @page: page of interest + * mem_cgroup_css_from_folio - css of the memcg associated with a folio + * @folio: folio of interest * * If memcg is bound to the default hierarchy, css of the memcg associated - * with @page is returned. The returned css remains associated with @page + * with @folio is returned. The returned css remains associated with @folio * until it is released. * * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */ -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) +struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) { - struct mem_cgroup *memcg; - - memcg = page_memcg(page); + struct mem_cgroup *memcg = folio_memcg(folio); if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; -- cgit v1.2.3 From 36c7b4db7c942ae9e1b111f0c6b468c8b2e33842 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:24 +0000 Subject: mm: multi-gen LRU: section for memcg LRU Move memcg LRU code into a dedicated section. Improve the design doc to outline its architecture. Link: https://lkml.kernel.org/r/20230118001827.1040870-5-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/memcontrol.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 893427aded01..17335459d8dc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -476,12 +476,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) struct mem_cgroup_tree_per_node *mctz; if (lru_gen_enabled()) { - struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; - - /* see the comment on MEMCG_NR_GENS */ - if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) - lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); - + if (soft_limit_excess(memcg)) + lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); return; } -- cgit v1.2.3 From f7a449f779608efe1941a0e0c4bd7b5f57000be7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 13 Feb 2023 11:29:22 -0800 Subject: mm: memcontrol: rename memcg_kmem_enabled() Currently there are two kmem-related helper functions with a confusing semantics: memcg_kmem_enabled() and mem_cgroup_kmem_disabled(). The problem is that an obvious expectation memcg_kmem_enabled() == !mem_cgroup_kmem_disabled(), can be false. mem_cgroup_kmem_disabled() is similar to mem_cgroup_disabled(): it returns true only if CONFIG_MEMCG_KMEM is not set or the kmem accounting is disabled using a boot time kernel option "cgroup.memory=nokmem". It never changes the value dynamically. memcg_kmem_enabled() is different: it always returns false until the first non-root memory cgroup will get online (assuming the kernel memory accounting is enabled). It's goal is to improve the performance on systems without the cgroupfs mounted/memory controller enabled or on the systems with only the root memory cgroup. To make things more obvious and avoid potential bugs, let's rename memcg_kmem_enabled() to memcg_kmem_online(). Link: https://lkml.kernel.org/r/20230213192922.1146370-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Muchun Song Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Cc: Dennis Zhou Signed-off-by: Andrew Morton --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 17335459d8dc..3e3cdb9bed95 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -345,8 +345,8 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ -DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); -EXPORT_SYMBOL(memcg_kmem_enabled_key); +DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key); +EXPORT_SYMBOL(memcg_kmem_online_key); #endif /** @@ -3034,7 +3034,7 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return NULL; if (PageMemcgKmem(page)) { @@ -3746,7 +3746,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) objcg->memcg = memcg; rcu_assign_pointer(memcg->objcg, objcg); - static_branch_enable(&memcg_kmem_enabled_key); + static_branch_enable(&memcg_kmem_online_key); memcg->kmemcg_id = memcg->id.id; -- cgit v1.2.3 From f7f9c00dfafffd7a5a1a5685e2d874c64913e2ed Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:35 +0800 Subject: mm: change to return bool for isolate_lru_page() The isolate_lru_page() can only return 0 or -EBUSY, and most users did not care about the negative error of isolate_lru_page(), except one user in add_page_for_migration(). So we can convert the isolate_lru_page() to return a boolean value, which can help to make the code more clear when checking the return value of isolate_lru_page(). Also convert all users' logic of checking the isolation state. No functional changes intended. Link: https://lkml.kernel.org/r/3074c1ab628d9dbf139b33f248a8bc253a3f95f0.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3e3cdb9bed95..25f2465d5a37 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6176,7 +6176,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { page = target.page; - if (!isolate_lru_page(page)) { + if (isolate_lru_page(page)) { if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; @@ -6226,7 +6226,7 @@ retry: */ if (PageTransCompound(page)) goto put; - if (!device && isolate_lru_page(page)) + if (!device && !isolate_lru_page(page)) goto put; if (!mem_cgroup_move_account(page, false, mc.from, mc.to)) { -- cgit v1.2.3