From d8f5f7e445f02eb10dee1a0a992146314cf460f8 Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Tue, 29 Aug 2023 14:37:34 -0700
Subject: hugetlb: set hugetlb page flag before optimizing vmemmap

Currently, vmemmap optimization of hugetlb pages is performed before the hugetlb flag (previously hugetlb destructor) is set, identifying it as a hugetlb folio.  This means there is a window of time where a folio that looks like an ordinary folio does not have all of its associated vmemmap present.  The core mm only expects vmemmap to be potentially optimized for hugetlb and device dax.  This can cause problems in code such as memory error handling that may want to write to tail struct pages.

There is only one call to perform hugetlb vmemmap optimization today.  To fix this issue, simply set the hugetlb flag before that call.

There was a similar issue in the free hugetlb path that was previously addressed.  The two routines that optimize or restore hugetlb vmemmap should only be passed hugetlb folios/pages.  To catch any callers not following this rule, add VM_WARN_ON calls to the routines.  In the hugetlb free code paths, some calls could be made to restore vmemmap after clearing the hugetlb flag.  This was 'safe' as in these cases vmemmap was already present and the call was a NOOP.  However, for consistency these calls were eliminated so that we can add the VM_WARN_ON checks.

Link: https://lkml.kernel.org/r/20230829213734.69673-1-mike.kravetz@oracle.com
Fixes: f41f2ed43ca5 ("mm: hugetlb: free the vmemmap pages associated with each HugeTLB page")
Signed-off-by: Mike Kravetz
Reviewed-by: Muchun Song
Cc: James Houghton
Cc: Miaohe Lin
Cc: Michal Hocko
Cc: Naoya Horiguchi
Cc: Usama Arif
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 52d26072dfda..afce9037f9ee 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1720,7 +1720,12 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
 		return;
 
-	if (hugetlb_vmemmap_restore(h, &folio->page)) {
+	/*
+	 * If folio is not vmemmap optimized (!clear_dtor), then the folio
+	 * is no longer identified as a hugetlb page.  hugetlb_vmemmap_restore
+	 * can only be passed hugetlb pages and will BUG otherwise.
+	 */
+	if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) {
 		spin_lock_irq(&hugetlb_lock);
 		/*
 		 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1930,9 +1935,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
 
 static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
 {
+	folio_set_hugetlb(folio);
 	hugetlb_vmemmap_optimize(h, &folio->page);
 	INIT_LIST_HEAD(&folio->lru);
-	folio_set_hugetlb(folio);
 	hugetlb_set_folio_subpool(folio, NULL);
 	set_hugetlb_cgroup(folio, NULL);
 	set_hugetlb_cgroup_rsvd(folio, NULL);
@@ -3580,13 +3585,21 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
 	remove_hugetlb_folio_for_demote(h, folio, false);
 	spin_unlock_irq(&hugetlb_lock);
 
-	rc = hugetlb_vmemmap_restore(h, &folio->page);
-	if (rc) {
-		/* Allocation of vmemmmap failed, we can not demote folio */
-		spin_lock_irq(&hugetlb_lock);
-		folio_ref_unfreeze(folio, 1);
-		add_hugetlb_folio(h, folio, false);
-		return rc;
+	/*
+	 * If vmemmap already existed for folio, the remove routine above would
+	 * have cleared the hugetlb folio flag.  Hence the folio is technically
+	 * no longer a hugetlb folio.
+	 * hugetlb_vmemmap_restore can only be
+	 * passed hugetlb folios and will BUG otherwise.
+	 */
+	if (folio_test_hugetlb(folio)) {
+		rc = hugetlb_vmemmap_restore(h, &folio->page);
+		if (rc) {
+			/* Allocation of vmemmmap failed, we can not demote folio */
+			spin_lock_irq(&hugetlb_lock);
+			folio_ref_unfreeze(folio, 1);
+			add_hugetlb_folio(h, folio, false);
+			return rc;
+		}
 	}
 
 	/*
-- cgit v1.2.3

From b72b3c9c34c825c81d205241c5f822fc7835923f Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Tue, 29 Aug 2023 11:33:43 +0800
Subject: mm/hugetlb: fix nodes huge page allocation when there are surplus pages

In set_nr_huge_pages(), the local variable "count" is used to record persistent_huge_pages(), but when it comes to per-node huge page allocation, the semantics change to nr_huge_pages.  When there are surplus huge pages and the interface under /sys/devices/system/node/node*/hugepages is used to change the huge page pool size, this difference can result in the allocation of an unexpected number of huge pages.

Steps to reproduce the bug:

Starting with:

                    Node 0    Node 1     Total
  HugePages_Total     0.00      0.00      0.00
  HugePages_Free      0.00      0.00      0.00
  HugePages_Surp      0.00      0.00      0.00

create 100 huge pages in Node 0 and consume them, then set Node 0's nr_hugepages to 0.

yields:

                    Node 0    Node 1     Total
  HugePages_Total   200.00      0.00    200.00
  HugePages_Free      0.00      0.00      0.00
  HugePages_Surp    200.00      0.00    200.00

write 100 to Node 1's nr_hugepages:

  echo 100 > /sys/devices/system/node/node1/\
  hugepages/hugepages-2048kB/nr_hugepages

gets:

                    Node 0    Node 1     Total
  HugePages_Total   200.00    400.00    600.00
  HugePages_Free      0.00    400.00    400.00
  HugePages_Surp    200.00      0.00    200.00

The kernel is expected to create only 100 huge pages, but it gives 200: the old calculation folded other nodes' surplus pages into what should be a target for persistent pages only.

Link: https://lkml.kernel.org/r/20230829033343.467779-1-xueshi.hu@smartx.com
Fixes: 9a30523066cd ("hugetlb: add per node hstate attributes")
Signed-off-by: Xueshi Hu
Reviewed-by: Mike Kravetz
Cc: Andi Kleen
Cc: Lee Schermerhorn
Cc: Mel Gorman
Cc: Muchun Song
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index afce9037f9ee..7c90c43574a6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3457,7 +3457,9 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	if (nid != NUMA_NO_NODE) {
 		unsigned long old_count = count;
 
-		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+		count += persistent_huge_pages(h) -
+			 (h->nr_huge_pages_node[nid] -
+			  h->surplus_huge_pages_node[nid]);
 		/*
 		 * User may have specified a large count value which caused the
 		 * above calculation to overflow. In this case, they wanted
-- cgit v1.2.3

From 426056efe835cf4864ccf4c328fe3af9146fc539 Mon Sep 17 00:00:00 2001
From: Zi Yan
Date: Wed, 13 Sep 2023 16:12:45 -0400
Subject: mm/hugetlb: use nth_page() in place of direct struct page manipulation

When dealing with hugetlb pages, manipulating struct page pointers directly can land on the wrong struct page, since struct page is not guaranteed to be contiguous on SPARSEMEM without VMEMMAP.  Use nth_page() to handle it properly.

Without the fix, a wrong or non-existent page might be grabbed, leading either to an unfreeable page or to kernel memory access errors.  No bug has been reported; the issue was found by code inspection.
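For reference, nth_page() encapsulates exactly this memory-model dependence.  A simplified sketch of its definition (close to, though not necessarily identical with, the include/linux/mm.h version of that era):

  #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
  /* the memmap is per-section and may be discontiguous: go via the pfn */
  #define nth_page(page, n)   pfn_to_page(page_to_pfn((page)) + (n))
  #else
  /* the memmap is virtually contiguous: plain pointer arithmetic is safe */
  #define nth_page(page, n)   ((page) + (n))
  #endif

A gigantic page can span multiple sections, so "page + n" may cross a section boundary where the two definitions diverge, which is why the hunk below switches to nth_page().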
Link: https://lkml.kernel.org/r/20230913201248.452081-3-zi.yan@sent.com Fixes: 57a196a58421 ("hugetlb: simplify hugetlb handling in follow_page_mask") Signed-off-by: Zi Yan Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c90c43574a6..a945efe2858a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6493,7 +6493,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, } } - page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)); /* * Note that page may be a sub-page, and with vmemmap -- cgit v1.2.3 From fde1c4ecf91640e5a95ec36b71ec2e8ec379ce40 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 13 Sep 2023 11:54:01 +0100 Subject: mm: hugetlb: skip initialization of gigantic tail struct pages if freed by HVO The new boot flow when it comes to initialization of gigantic pages is as follows: - At boot time, for a gigantic page during __alloc_bootmem_hugepage, the region after the first struct page is marked as noinit. - This results in only the first struct page to be initialized in reserve_bootmem_region. As the tail struct pages are not initialized at this point, there can be a significant saving in boot time if HVO succeeds later on. - Later on in the boot, the head page is prepped and the first HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page) - 1 tail struct pages are initialized. - HVO is attempted. If it is not successful, then the rest of the tail struct pages are initialized. If it is successful, no more tail struct pages need to be initialized saving significant boot time. The WARN_ON for increased ref count in gather_bootmem_prealloc was changed to a VM_BUG_ON. This is OK as there should be no speculative references this early in boot process. The VM_BUG_ON's are there just in case such code is introduced. [akpm@linux-foundation.org: make it nicer for 80 cols] Link: https://lkml.kernel.org/r/20230913105401.519709-5-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Fam Zheng Cc: Mike Rapoport (IBM) Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a945efe2858a..4e276466d6aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3169,6 +3169,16 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) } found: + + /* + * Only initialize the head struct page in memmap_init_reserved_pages, + * rest of the struct pages will be initialized by the HugeTLB + * subsystem itself. + * The head struct page is used to get folio information by the HugeTLB + * subsystem like zone id and node id. 
+ */ + memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE), + huge_page_size(h) - PAGE_SIZE); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); @@ -3176,6 +3186,43 @@ found: return 1; } +/* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ +static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, + unsigned long start_page_number, + unsigned long end_page_number) +{ + enum zone_type zone = zone_idx(folio_zone(folio)); + int nid = folio_nid(folio); + unsigned long head_pfn = folio_pfn(folio); + unsigned long pfn, end_pfn = head_pfn + end_page_number; + int ret; + + for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone, nid); + prep_compound_tail((struct page *)folio, pfn - head_pfn); + ret = page_ref_freeze(page, 1); + VM_BUG_ON(!ret); + } +} + +static void __init hugetlb_folio_init_vmemmap(struct folio *folio, + struct hstate *h, + unsigned long nr_pages) +{ + int ret; + + /* Prepare folio head */ + __folio_clear_reserved(folio); + __folio_set_head(folio); + ret = page_ref_freeze(&folio->page, 1); + VM_BUG_ON(!ret); + /* Initialize the necessary tail struct pages */ + hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); + prep_compound_head((struct page *)folio, huge_page_order(h)); +} + /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_ORDER) pages. @@ -3186,19 +3233,21 @@ static void __init gather_bootmem_prealloc(void) list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); - struct folio *folio = page_folio(page); + struct folio *folio = (void *)page; struct hstate *h = m->hstate; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); - if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { - WARN_ON(folio_test_reserved(folio)); - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); - free_huge_folio(folio); /* add to the hugepage allocator */ - } else { - /* VERY unlikely inflated ref count on a tail page */ - free_gigantic_folio(folio, huge_page_order(h)); - } + + hugetlb_folio_init_vmemmap(folio, h, + HUGETLB_VMEMMAP_RESERVE_PAGES); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + /* If HVO fails, initialize all tail struct pages */ + if (!HPageVmemmapOptimized(&folio->page)) + hugetlb_folio_init_tail_vmemmap(folio, + HUGETLB_VMEMMAP_RESERVE_PAGES, + pages_per_huge_page(h)); + free_huge_folio(folio); /* add to the hugepage allocator */ /* * We need to restore the 'stolen' pages to totalram_pages @@ -3209,6 +3258,7 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } } + static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; -- cgit v1.2.3 From 3ec145f9d01e799bc7e41277e571141546b850f3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:23 +0100 Subject: hugetlb: use a folio in free_hpage_workfn() Patch series "Small hugetlb cleanups", v2. Some trivial folio conversions This patch (of 3): update_and_free_hugetlb_folio puts the memory on hpage_freelist as a folio so we can take it off the list as a folio. 
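For context, the producer side of hpage_freelist looks roughly like this (a sketch of the enqueue done when freeing cannot sleep; the real code in mm/hugetlb.c may differ in detail):

  /*
   * A free hugetlb folio's ->mapping field is otherwise unused, so it
   * is recycled as an llist_node; free_hpage_workfn() recovers the
   * folio with container_of() and clears ->mapping again.
   */
  if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
          schedule_work(&free_hpage_work);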
Link: https://lkml.kernel.org/r/20230824141325.2704553-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230824141325.2704553-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4e276466d6aa..7a6fe4f13777 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1787,22 +1787,22 @@ static void free_hpage_workfn(struct work_struct *work) node = llist_del_all(&hpage_freelist); while (node) { - struct page *page; + struct folio *folio; struct hstate *h; - page = container_of((struct address_space **)node, - struct page, mapping); + folio = container_of((struct address_space **)node, + struct folio, mapping); node = node->next; - page->mapping = NULL; + folio->mapping = NULL; /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will clear the hugetlb bit, so do * not use folio_hstate() directly. */ - h = size_to_hstate(page_size(page)); + h = size_to_hstate(folio_size(folio)); - __update_and_free_hugetlb_folio(h, page_folio(page)); + __update_and_free_hugetlb_folio(h, folio); cond_resched(); } -- cgit v1.2.3 From 04bbfd844b99c7c89a344236d3758e7a2d41572f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:24 +0100 Subject: hugetlb: remove a few calls to page_folio() Anything found on a linked list threaded through ->lru is guaranteed to be a folio as the compound_head found in a tail page overlaps the ->lru member of struct page. So we can pull folios directly off these lists no matter whether pages or folios were added to the list. 
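The overlap being relied on can be sketched from struct page's layout (heavily simplified; the real definition in include/linux/mm_types.h is a much larger union):

  struct page {
          unsigned long flags;
          union {
                  struct list_head lru;           /* head (folio) pages */
                  unsigned long compound_head;    /* tail pages: head pointer | 1 */
          };
          /* ... */
  };

A tail page threaded onto an ->lru list would expose its compound_head word (always odd) as lru.next, which list debugging would reject, so every entry reached through ->lru is necessarily a head page, i.e. a folio.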
Link: https://lkml.kernel.org/r/20230824141325.2704553-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: Mike Kravetz
Reviewed-by: Muchun Song
Cc: Sidhartha Kumar
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7a6fe4f13777..e91ce966a2c9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1836,11 +1836,9 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
 
 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
 {
-	struct page *page, *t_page;
-	struct folio *folio;
+	struct folio *folio, *t_folio;
 
-	list_for_each_entry_safe(page, t_page, list, lru) {
-		folio = page_folio(page);
+	list_for_each_entry_safe(folio, t_folio, list, lru) {
 		update_and_free_hugetlb_folio(h, folio, false);
 		cond_resched();
 	}
@@ -2229,8 +2227,7 @@ static struct page *remove_pool_huge_page(struct hstate *h,
 					    bool acct_surplus)
 {
 	int nr_nodes, node;
-	struct page *page = NULL;
-	struct folio *folio;
+	struct folio *folio = NULL;
 
 	lockdep_assert_held(&hugetlb_lock);
 	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
@@ -2240,15 +2237,14 @@
 		 */
 		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
 		    !list_empty(&h->hugepage_freelists[node])) {
-			page = list_entry(h->hugepage_freelists[node].next,
-					  struct page, lru);
-			folio = page_folio(page);
+			folio = list_entry(h->hugepage_freelists[node].next,
+					   struct folio, lru);
 			remove_hugetlb_folio(h, folio, acct_surplus);
 			break;
 		}
 	}
 
-	return page;
+	return &folio->page;
 }
 
 /*
@@ -3414,15 +3410,15 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
 	 * Collect pages to be freed on a list, and free after dropping lock
 	 */
 	for_each_node_mask(i, *nodes_allowed) {
-		struct page *page, *next;
+		struct folio *folio, *next;
 		struct list_head *freel = &h->hugepage_freelists[i];
-		list_for_each_entry_safe(page, next, freel, lru) {
+		list_for_each_entry_safe(folio, next, freel, lru) {
 			if (count >= h->nr_huge_pages)
 				goto out;
-			if (PageHighMem(page))
+			if (folio_test_highmem(folio))
 				continue;
-			remove_hugetlb_folio(h, page_folio(page), false);
-			list_add(&page->lru, &page_list);
+			remove_hugetlb_folio(h, folio, false);
+			list_add(&folio->lru, &page_list);
 		}
 	}
-- cgit v1.2.3

From d5b43e9683ec5c78524f9ae93c1824edcbc1f08d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Thu, 24 Aug 2023 15:13:25 +0100
Subject: hugetlb: convert remove_pool_huge_page() to remove_pool_hugetlb_folio()

Convert the callers to expect a folio and remove the unnecessary conversion back to a struct page.

Link: https://lkml.kernel.org/r/20230824141325.2704553-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: Mike Kravetz
Reviewed-by: Muchun Song
Cc: Sidhartha Kumar
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e91ce966a2c9..de220e3ff8be 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1446,7 +1446,7 @@ static int hstate_next_node_to_alloc(struct hstate *h,
 }
 
 /*
- * helper for remove_pool_huge_page() - return the previously saved
+ * helper for remove_pool_hugetlb_folio() - return the previously saved
 * node ["this node"] from which to free a huge page.
Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. @@ -2222,9 +2222,8 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ -static struct page *remove_pool_huge_page(struct hstate *h, - nodemask_t *nodes_allowed, - bool acct_surplus) +static struct folio *remove_pool_hugetlb_folio(struct hstate *h, + nodemask_t *nodes_allowed, bool acct_surplus) { int nr_nodes, node; struct folio *folio = NULL; @@ -2244,7 +2243,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, } } - return &folio->page; + return folio; } /* @@ -2598,7 +2597,6 @@ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; - struct page *page; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); @@ -2619,15 +2617,17 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. - * remove_pool_huge_page() will balance the freed pages across the + * remove_pool_hugetlb_folio() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { - page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); - if (!page) + struct folio *folio; + + folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1); + if (!folio) goto out; - list_add(&page->lru, &page_list); + list_add(&folio->lru, &page_list); } out: @@ -3472,7 +3472,6 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; - struct page *page; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); @@ -3594,11 +3593,13 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { - page = remove_pool_huge_page(h, nodes_allowed, 0); - if (!page) + struct folio *folio; + + folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); + if (!folio) break; - list_add(&page->lru, &page_list); + list_add(&folio->lru, &page_list); } /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); -- cgit v1.2.3 From a08c7193e4f18dc8508f2d07d0de2c5b94cb39a3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Sep 2023 12:20:17 -0700 Subject: mm/filemap: remove hugetlb special casing in filemap.c Remove special cased hugetlb handling code within the page cache by changing the granularity of ->index to the base page size rather than the huge page size. The motivation of this patch is to reduce complexity within the filemap code while also increasing performance by removing branches that are evaluated on every page cache lookup. To support the change in index, new wrappers for hugetlb page cache interactions are added. These wrappers perform the conversion to a linear index which is now expected by the page cache for huge pages. ========================= PERFORMANCE ====================================== Perf was used to check the performance differences after the patch. 
Overall the performance is similar to mainline, with a slightly larger overhead in __filemap_add_folio() and hugetlb_add_to_page_cache().  This comes from the extra work in xa_load() and xa_store(), as the xarray now uses more entries to store hugetlb folios in the page cache.

Timing

aarch64
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt
            real    1m49.568s
            user    0m0.000s
            sys     1m49.461s

        6.5-rc3:
            [root]# time fallocate -l 700GB test.txt
            real    1m47.495s
            user    0m0.000s
            sys     1m47.370s

    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m47.024s
            user    0m0.000s
            sys     1m46.921s

        6.5-rc3:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m44.551s
            user    0m0.000s
            sys     1m44.438s

x86
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt
            real    0m22.383s
            user    0m0.000s
            sys     0m22.255s

        6.5-rc3:
            [opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt
            real    0m22.735s
            user    0m0.038s
            sys     0m22.567s

    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt
            real    0m25.786s
            user    0m0.001s
            sys     0m25.589s

        6.5-rc3:
            [root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt
            real    0m33.454s
            user    0m0.001s
            sys     0m33.193s

aarch64: workload - fallocate a 700GB file backed by huge pages

6.5-rc3 + this patch, 2MB Page Size:

    --100.00%--__arm64_sys_fallocate
               ksys_fallocate
               vfs_fallocate
               hugetlbfs_fallocate
               |
               |--95.04%--__pi_clear_page
               |
               |--3.57%--clear_huge_page
               |          |
               |          |--2.63%--rcu_all_qs
               |          |
               |           --0.91%--__cond_resched
               |
                --0.67%--__cond_resched

    0.17%  0.00%  0   fallocate  [kernel.vmlinux]   [k] hugetlb_add_to_page_cache
    0.14%  0.10%  11  fallocate  [kernel.vmlinux]   [k] __filemap_add_folio

6.5-rc3, 2MB Page Size:

    --100.00%--__arm64_sys_fallocate
               ksys_fallocate
               vfs_fallocate
               hugetlbfs_fallocate
               |
               |--94.91%--__pi_clear_page
               |
               |--4.11%--clear_huge_page
               |          |
               |          |--3.00%--rcu_all_qs
               |          |
               |           --1.10%--__cond_resched
               |
                --0.59%--__cond_resched

    0.08%  0.01%  1   fallocate  [kernel.kallsyms]  [k] hugetlb_add_to_page_cache
    0.05%  0.03%  3   fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

x86: workload - fallocate a 100GB file backed by huge pages

6.5-rc3 + this patch, 2MB Page Size:

    hugetlbfs_fallocate
    |
     --99.57%--clear_huge_page
               |
               |--98.47%--clear_page_erms
               |
                --0.53%--asm_sysvec_apic_timer_interrupt

    0.04%  0.04%  1   fallocate  [kernel.kallsyms]  [k] xa_load
    0.04%  0.00%  0   fallocate  [kernel.kallsyms]  [k] hugetlb_add_to_page_cache
    0.04%  0.00%  0   fallocate  [kernel.kallsyms]  [k] __filemap_add_folio
    0.04%  0.00%  0   fallocate  [kernel.kallsyms]  [k] xas_store

6.5-rc3, 2MB Page Size:

    --99.93%--__x64_sys_fallocate
              vfs_fallocate
              hugetlbfs_fallocate
              |
               --99.38%--clear_huge_page
                         |
                         |--98.40%--clear_page_erms
                         |
                          --0.59%--__cond_resched

    0.03%  0.03%  1   fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

========================= TESTING ======================================

This patch passes libhugetlbfs tests and LTP hugetlb tests.

    **********  TEST SUMMARY
    *                       2M
    *                       32-bit  64-bit
    *     Total testcases:     110     113
    *             Skipped:       0       0
    *                PASS:     107     113
    *                FAIL:       0       0
    *    Killed by signal:       3       0
    *   Bad configuration:       0       0
    *       Expected FAIL:       0       0
    *     Unexpected PASS:       0       0
    *    Test not present:       0       0
    * Strange test result:       0       0
    **********

    Done executing testcases.
LTP Version: 20220527-178-g2761a81c4 page migration was also tested using Mike Kravetz's test program.[8] [dan.carpenter@linaro.org: fix an NULL vs IS_ERR() bug] Link: https://lkml.kernel.org/r/1772c296-1417-486f-8eef-171af2192681@moroto.mountain Link: https://lkml.kernel.org/r/20230926192017.98183-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Signed-off-by: Dan Carpenter Reported-and-tested-by: syzbot+c225dea486da4d5592bd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c225dea486da4d5592bd Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de220e3ff8be..0aaf28ce32e5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -952,7 +952,7 @@ static long region_count(struct resv_map *resv, long f, long t) /* * Convert the address within this vma to the page offset within - * the mapping, in pagecache page units; huge pages here. + * the mapping, huge page units here. */ static pgoff_t vma_hugecache_offset(struct hstate *h, struct vm_area_struct *vma, unsigned long address) @@ -961,13 +961,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -pgoff_t linear_hugepage_index(struct vm_area_struct *vma, - unsigned long address) -{ - return vma_hugecache_offset(hstate_vma(vma), vma, address); -} -EXPORT_SYMBOL_GPL(linear_hugepage_index); - /** * vma_kernel_pagesize - Page size granularity for this VMA. * @vma: The user mapping. @@ -2074,20 +2067,6 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) return NULL; } -pgoff_t hugetlb_basepage_index(struct page *page) -{ - struct page *page_head = compound_head(page); - pgoff_t index = page_index(page_head); - unsigned long compound_idx; - - if (compound_order(page_head) > MAX_ORDER) - compound_idx = page_to_pfn(page) - page_to_pfn(page_head); - else - compound_idx = page - page_head; - - return (index << compound_order(page_head)) + compound_idx; -} - static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) @@ -5772,7 +5751,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; - pgoff_t idx = vma_hugecache_offset(h, vma, address); + pgoff_t idx = linear_page_index(vma, address); struct folio *folio; folio = filemap_get_folio(mapping, idx); @@ -5789,6 +5768,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping struct hstate *h = hstate_inode(inode); int err; + idx <<= huge_page_order(h); __folio_set_locked(folio); err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); @@ -5896,7 +5876,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * before we get page_table_lock. 
 	 */
 	new_folio = false;
-	folio = filemap_lock_folio(mapping, idx);
+	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 	if (IS_ERR(folio)) {
 		size = i_size_read(mapping->host) >> huge_page_shift(h);
 		if (idx >= size)
@@ -6205,7 +6185,7 @@
 	/* Just decrements count, does not deallocate */
 	vma_end_reservation(h, vma, haddr);
 
-	pagecache_folio = filemap_lock_folio(mapping, idx);
+	pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 	if (IS_ERR(pagecache_folio))
 		pagecache_folio = NULL;
 	}
@@ -6338,7 +6318,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 
 	if (is_continue) {
 		ret = -EFAULT;
-		folio = filemap_lock_folio(mapping, idx);
+		folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 		if (IS_ERR(folio))
 			goto out;
 		folio_in_pagecache = true;
-- cgit v1.2.3

From a48bf7b4757cd8de3497c2878536f46a8d2da65c Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar
Date: Tue, 26 Sep 2023 10:44:33 -0700
Subject: mm/hugetlb: replace page_ref_freeze() with folio_ref_freeze() in hugetlb_folio_init_vmemmap()

No functional difference; folio_ref_freeze() is currently a wrapper for page_ref_freeze().

Link: https://lkml.kernel.org/r/20230926174433.81241-1-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar
Reviewed-by: Muchun Song
Cc: Matthew Wilcox (Oracle)
Cc: Mike Kravetz
Cc: Usama Arif
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0aaf28ce32e5..36b40bc9ac25 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3191,7 +3191,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
 	/* Prepare folio head */
 	__folio_clear_reserved(folio);
 	__folio_set_head(folio);
-	ret = page_ref_freeze(&folio->page, 1);
+	ret = folio_ref_freeze(folio, 1);
 	VM_BUG_ON(!ret);
 	/* Initialize the necessary tail struct pages */
 	hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
-- cgit v1.2.3

From 30a89adf872d2e46323840964c95dc0ae3bb5843 Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Mon, 16 Oct 2023 19:55:49 -0700
Subject: hugetlb: check for hugetlb folio before vmemmap_restore

In commit d8f5f7e445f0 ("hugetlb: set hugetlb page flag before optimizing vmemmap") checks were added to print a warning if hugetlb_vmemmap_restore was called on a non-hugetlb page.  This was mostly due to ordering issues in the hugetlb page setup and teardown sequences.  One place missed was the routine dissolve_free_huge_page.

Naoya Horiguchi noted: "I saw that VM_WARN_ON_ONCE() in hugetlb_vmemmap_restore is triggered when memory_failure() is called on a free hugetlb page with vmemmap optimization disabled (the warning is not triggered if vmemmap optimization is enabled).  I think that we need check folio_test_hugetlb() before dissolve_free_huge_page() calls hugetlb_vmemmap_restore_folio()."

Perform the check as suggested by Naoya.
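For reference, the warning that fires is of this general shape (a sketch; the exact statement in mm/hugetlb_vmemmap.c may differ):

  int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
  {
          /* Added by d8f5f7e445f0: callers must pass hugetlb pages only */
          VM_WARN_ON_ONCE(!PageHuge(head));
          /* ... */
  }

Guarding the call site with folio_test_hugetlb(), as the hunk below does for dissolve_free_huge_page(), keeps the warning meaningful for genuinely misordered callers.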
Link: https://lkml.kernel.org/r/20231017032140.GA3680@monkey Fixes: d8f5f7e445f0 ("hugetlb: set hugetlb page flag before optimizing vmemmap") Signed-off-by: Mike Kravetz Suggested-by: Naoya Horiguchi Tested-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Joao Martins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3fb3a5232173..dfb4534834f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2322,17 +2322,23 @@ retry: * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. + * + * The folio_test_hugetlb check here is because + * remove_hugetlb_folio will clear hugetlb folio flag for + * non-vmemmap optimized hugetlb folios. */ - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (!rc) { - update_and_free_hugetlb_folio(h, folio, false); - } else { - spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); - h->max_huge_pages++; - spin_unlock_irq(&hugetlb_lock); - } + if (folio_test_hugetlb(folio)) { + rc = hugetlb_vmemmap_restore(h, &folio->page); + if (rc) { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_folio(h, folio, false); + h->max_huge_pages++; + goto out; + } + } else + rc = 0; + update_and_free_hugetlb_folio(h, folio, false); return rc; } out: -- cgit v1.2.3 From d61ea1cb009532dcbd77a9d44b812704cec60146 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 21 Aug 2023 19:15:13 +0500 Subject: userfaultfd: UFFD_FEATURE_WP_ASYNC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Implement IOCTL to get and optionally clear info about PTEs", v33. *Motivation* The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows GetWriteWatch() and ResetWriteWatch() syscalls [1]. The GetWriteWatch() retrieves the addresses of the pages that are written to in a region of virtual memory. This syscall is used in Windows applications and games etc. This syscall is being emulated in pretty slow manner in userspace. Our purpose is to enhance the kernel such that we translate it efficiently in a better way. Currently some out of tree hack patches are being used to efficiently emulate it in some kernels. We intend to replace those with these patches. So the whole gaming on Linux can effectively get benefit from this. It means there would be tons of users of this code. CRIU use case [2] was mentioned by Andrei and Danylo: > Use cases for migrating sparse VMAs are binaries sanitized with ASAN, > MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of > shadow memory [4]. Being able to migrate such binaries allows to highly > reduce the amount of work needed to identify and fix post-migration > crashes, which happen constantly. Andrei defines the following uses of this code: * it is more granular and allows us to track changed pages more effectively. The current interface can clear dirty bits for the entire process only. In addition, reading info about pages is a separate operation. It means we must freeze the process to read information about all its pages, reset dirty bits, only then we can start dumping pages. The information about pages becomes more and more outdated, while we are processing pages. 
The new interface solves both these downsides.  First, it allows us to read pte bits and clear the soft-dirty bit atomically.  It means that CRIU will not need to freeze processes to pre-dump their memory.  Second, it clears soft-dirty bits for a specified region of memory.  It means CRIU will have up-to-date info about pages at the moment of dumping them.

* The new interface has to be much faster because basic page filtering happens in the kernel.  With the old interface, we have to read the pagemap for each page.

*Implementation Evolution (Short Summary)*

From the definition of GetWriteWatch(), we feel like the kernel's soft-dirty feature can be used under the hood with some additions like:

* reset the soft-dirty flag for only a specific region of memory instead of clearing the flag for the entire process
* get and clear the soft-dirty flag for a specific region atomically

So we decided to use an ioctl on the pagemap file to read and/or reset the soft-dirty flag.  But using the soft-dirty flag, we sometimes get extra pages which weren't even written.  They had become soft-dirty because of VMA merging and the VM_SOFTDIRTY flag.  This breaks the definition of GetWriteWatch().  We were able to bypass this shortcoming by ignoring VM_SOFTDIRTY until David reported that mprotect etc. messes up the soft-dirty flag while ignoring VM_SOFTDIRTY [5].  This wasn't happening until [6] got introduced.  We discussed whether we could revert these patches, but could not reach any conclusion.  So at this point, I made a couple of attempts to solve this whole VM_SOFTDIRTY issue by correcting the soft-dirty implementation:

* [7] Correct the bug fixed wrongly back in 2014.  It had the potential to cause a regression.  We left it behind.
* [8] Keep a list of soft-dirty parts of a VMA across splits and merges.  I got the reply not to increase the size of the VMA by 8 bytes.

At this point, we left soft-dirty behind, considering it too delicate, and userfaultfd [9] seemed like the only way forward.  From there onward, we have been basing the soft-dirty emulation on the userfaultfd wp feature, where the kernel resolves the faults itself when the WP_ASYNC feature is used.  It was straightforward to add the WP_ASYNC feature to userfaultfd.  Now only those pages are reported dirty or written-to which really were written.  (PS: another userfaultfd feature, WP_UNPOPULATED, is required to avoid pre-faulting memory before write-protecting [9].)

All the different masks were added at the request of CRIU devs to make the interface more generic and better.

[1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-getwritewatch
[2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com
[3] https://github.com/google/sanitizers
[4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit
[5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com
[6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/
[7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.com
[8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.com
[9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com
[10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com

This patch (of 6):

Add a new userfaultfd-wp feature UFFD_FEATURE_WP_ASYNC, that allows userfaultfd wr-protect faults to be resolved by the kernel directly.
It can be used like a high-accuracy version of soft-dirty, without vma modifications during tracking, and also with ranged support by default rather than for a whole mm when resetting the protections, thanks to the existence of ioctl(UFFDIO_WRITEPROTECT).

Several goals of such a dirty tracking interface:

1. All types of memory should be supported and traceable.  This is natural for soft-dirty but worth mentioning when the context is userfaultfd, because it used to only support anon/shmem/hugetlb.  The problem is that for a dirty tracking purpose these three types may not be enough, and it's legal to track anything, e.g. any page cache writes from mmap.

2. Protections can be applied to part of a memory range, without vma split/merge fuss.  The hope is that the tracking itself should not affect any vma layout change.  It also helps when a reset happens, because the reset will not need the mmap write lock, which can block the tracee.

3. Accuracy needs to be maintained.  This means we need pte markers to work on any type of VMA.

One could argue that the whole concept of async dirty tracking is not really close to what userfaultfd fundamentally used to be: it's not "a fault to be serviced by userspace" anymore.  However, using userfaultfd-wp here as a framework is convenient for us in at least the following ways:

1. The VM_UFFD_WP vma flag, which has a very good name to suit something like this, so we don't need VM_YET_ANOTHER_SOFT_DIRTY.  Just use a new feature bit to distinguish it from a sync version of uffd-wp registration.

2. The PTE markers logic can be leveraged across the whole kernel to maintain the uffd-wp bit as long as an arch supports it.  This also applies to this case, where the uffd-wp bit will be a hint to dirty information and will not easily get lost (e.g. when some page cache ptes get zapped).

3. Reuse the ioctl(UFFDIO_WRITEPROTECT) interface for either starting or resetting a range of memory; there is no counterpart in the old soft-dirty world, so if this were wanted in a new design, a new interface would be needed.  We can somewhat understand that commonality because uffd-wp was fundamentally a similar idea of write-protecting pages, just like soft-dirty.

This implementation allows WP_ASYNC to imply WP_UNPOPULATED, because so far WP_ASYNC does not seem usable without WP_UNPOPULATED.  This also gives us the chance to modify the implementation of WP_ASYNC in case it no longer depends on WP_UNPOPULATED in future kernels.  It's also fine to imply that, because both features rely on the PTE_MARKER_UFFD_WP config option, so they'll show up together (or both be missing) in an UFFDIO_API probe.

vma_can_userfault() now allows any VMA if the userfaultfd registration is only about async uffd-wp.  So we can track dirty for all kinds of memory, including generic file systems (like XFS, EXT4 or BTRFS).

One trick worth mentioning in do_wp_page() is that we need to manually update vmf->orig_pte here because it can be used later with a pte_same() check - this path always has FAULT_FLAG_ORIG_PTE_VALID set in the flags.

The major defect of this approach to dirty tracking is that we need to populate the pgtables when tracking starts.  Soft-dirty doesn't do it like that.  This is unwanted in the case where the range of memory to track is huge and unpopulated (e.g., tracking updates on a 10G file with mmap() on top, without having any page cache installed yet).  One way to improve this is to allow pte markers to exist at levels above PTE (PMD+).  That will not change the interface if implemented, so we can leave it for later.
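To make the intended usage concrete, here is a minimal userspace sketch of starting an async write-tracking round (error handling omitted; the constants require uapi headers from this series onward, and the exact handshake here is illustrative rather than authoritative):

  #include <fcntl.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <linux/userfaultfd.h>

  static int start_async_wp_tracking(void *addr, unsigned long len)
  {
          int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
          struct uffdio_api api = {
                  .api = UFFD_API,
                  /* WP_ASYNC implies WP_UNPOPULATED, as described above */
                  .features = UFFD_FEATURE_WP_ASYNC | UFFD_FEATURE_WP_UNPOPULATED,
          };
          struct uffdio_register reg = {
                  .range = { .start = (unsigned long)addr, .len = len },
                  .mode = UFFDIO_REGISTER_MODE_WP,
          };
          struct uffdio_writeprotect wp = {
                  .range = { .start = (unsigned long)addr, .len = len },
                  .mode = UFFDIO_WRITEPROTECT_MODE_WP,
          };

          ioctl(uffd, UFFDIO_API, &api);
          ioctl(uffd, UFFDIO_REGISTER, &reg);
          /*
           * Start a tracking round: later writes are resolved by the kernel
           * (no userspace fault handler needed) and clear the uffd-wp bit,
           * which marks the page as written.
           */
          ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
          return uffd;
  }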
Link: https://lkml.kernel.org/r/20230821141518.870589-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20230821141518.870589-2-usama.anjum@collabora.com Signed-off-by: Peter Xu Co-developed-by: Muhammad Usama Anjum Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Cc: Michał Mirosław Signed-off-by: Andrew Morton --- mm/hugetlb.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dfb4534834f5..bc654b36df9f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6247,21 +6247,27 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = address, - .flags = flags, - }; + if (!userfaultfd_wp_async(vma)) { + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = address, + .flags = flags, + }; - spin_unlock(ptl); - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); + spin_unlock(ptl); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); + } + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return handle_userfault(&vmf, VM_UFFD_WP); } - hugetlb_vma_unlock_read(vma); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - return handle_userfault(&vmf, VM_UFFD_WP); + + entry = huge_pte_clear_uffd_wp(entry); + set_huge_pte_at(mm, haddr, ptep, entry); + /* Fallthrough to CoW */ } /* -- cgit v1.2.3 From 52526ca7fdb905a768a93f8faa418e9b988fc34b Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 21 Aug 2023 19:15:14 +0500 Subject: fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally clear the info about page table entries. The following operations are supported in this IOCTL: - Scan the address range and get the memory ranges matching the provided criteria. This is performed when the output buffer is specified. - Write-protect the pages. The PM_SCAN_WP_MATCHING is used to write-protect the pages of interest. The PM_SCAN_CHECK_WPASYNC aborts the operation if non-Async Write Protected pages are found. The ``PM_SCAN_WP_MATCHING`` can be used with or without PM_SCAN_CHECK_WPASYNC. - Both of those operations can be combined into one atomic operation where we can get and write protect the pages as well. 
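A sketch of how userspace might combine the scan and write-protect operations described above (field and constant names as added to the uapi by this series; buffer sizing and error handling are illustrative assumptions):

  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <linux/fs.h>   /* struct pm_scan_arg, PAGEMAP_SCAN */

  /*
   * Report the written regions in [start, end) and atomically re-write-
   * protect them for the next tracking round; pagemap_fd is an fd opened
   * on /proc/<pid>/pagemap.  Returns the number of filled page_region
   * entries (>= 0) or a negative error.
   */
  static long scan_and_reprotect(int pagemap_fd, uint64_t start, uint64_t end)
  {
          static struct page_region regions[256];
          struct pm_scan_arg arg = {
                  .size = sizeof(arg),
                  .flags = PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
                  .start = start,
                  .end = end,
                  .vec = (uintptr_t)regions,
                  .vec_len = 256,
                  .category_mask = PAGE_IS_WRITTEN,
                  .return_mask = PAGE_IS_WRITTEN,
          };

          return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
  }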
The following flags about pages are currently supported:

- PAGE_IS_WPALLOWED - Page has async-write-protection enabled
- PAGE_IS_WRITTEN - Page has been written to since it was write-protected
- PAGE_IS_FILE - Page is file backed
- PAGE_IS_PRESENT - Page is present in memory
- PAGE_IS_SWAPPED - Page is in swap
- PAGE_IS_PFNZERO - Page has the zero PFN
- PAGE_IS_HUGE - Page is THP or Hugetlb backed

This IOCTL can be extended to get information about more PTE bits.  The entire address range passed by the user, [start, end), is scanned until either the user-provided buffer is full or max_pages have been found.

[akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"]
[akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning]
[arnd@arndb.de: hide unused pagemap_scan_backout_range() function]
Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org
[sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"]
Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au
Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum
Signed-off-by: Michał Mirosław
Signed-off-by: Arnd Bergmann
Signed-off-by: Stephen Rothwell
Reviewed-by: Andrei Vagin
Reviewed-by: Michał Mirosław
Cc: Alex Sierra
Cc: Al Viro
Cc: Axel Rasmussen
Cc: Christian Brauner
Cc: Cyrill Gorcunov
Cc: Dan Williams
Cc: David Hildenbrand
Cc: Greg Kroah-Hartman
Cc: Gustavo A. R. Silva
Cc: "Liam R. Howlett"
Cc: Matthew Wilcox
Cc: Michal Miroslaw
Cc: Mike Rapoport (IBM)
Cc: Nadav Amit
Cc: Pasha Tatashin
Cc: Paul Gofman
Cc: Peter Xu
Cc: Shuah Khan
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Cc: Yang Shi
Cc: Yun Zhou
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc654b36df9f..2878e0e6bac5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5044,7 +5044,7 @@ bool is_hugetlb_entry_migration(pte_t pte)
 	return false;
 }
 
-static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
+bool is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
 
@@ -6266,7 +6266,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 
 		entry = huge_pte_clear_uffd_wp(entry);
-		set_huge_pte_at(mm, haddr, ptep, entry);
+		set_huge_pte_at(mm, haddr, ptep, entry,
+				huge_page_size(hstate_vma(vma)));
 
 		/* Fallthrough to CoW */
 	}
-- cgit v1.2.3

From 5ca432896a4ce6d69fffc3298b24c0dd9bdb871f Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Mon, 2 Oct 2023 16:29:47 +0200
Subject: mm/rmap: move SetPageAnonExclusive() out of page_move_anon_rmap()

Patch series "mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap()".

Convert page_move_anon_rmap() to folio_move_anon_rmap(), letting the callers handle PageAnonExclusive.  I'm including cleanup patch #3 because it fits into the picture and can be done more cleanly with the conversion.

This patch (of 3):

Let's move it into the caller: there is a difference between whether an anon folio can only be mapped by one process (e.g., into one VMA), and whether it is truly exclusive (e.g., no references -- including GUP -- from other processes).

Further, for large folios the page might not actually be pointing at the head page of the folio, so it is better handled in the caller.  This is a preparation for converting page_move_anon_rmap() to consume a folio.
Link: https://lkml.kernel.org/r/20231002142949.235104-1-david@redhat.com Link: https://lkml.kernel.org/r/20231002142949.235104-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2878e0e6bac5..35d924f74972 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5652,8 +5652,10 @@ retry_avoidcopy: * owner and can reuse this page. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { - if (!PageAnonExclusive(&old_folio->page)) + if (!PageAnonExclusive(&old_folio->page)) { page_move_anon_rmap(&old_folio->page, vma); + SetPageAnonExclusive(&old_folio->page); + } if (likely(!unshare)) set_huge_ptep_writable(vma, haddr, ptep); -- cgit v1.2.3 From 069686255c16a75b6a796e42df47f5af27b496a4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:48 +0200 Subject: mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap() Let's convert it to consume a folio. [akpm@linux-foundation.org: fix kerneldoc] Link: https://lkml.kernel.org/r/20231002142949.235104-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 35d924f74972..7ad9d2159da4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5653,7 +5653,7 @@ retry_avoidcopy: */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { - page_move_anon_rmap(&old_folio->page, vma); + folio_move_anon_rmap(old_folio, vma); SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) -- cgit v1.2.3 From 59838b2566f6d03099743675a2e0f425813078c6 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 4 Oct 2023 15:32:48 +0000 Subject: mm, hugetlb: remove HUGETLB_CGROUP_MIN_ORDER Originally, hugetlb_cgroup was the only hugetlb user of tail page structure fields. So, the code defined and checked against HUGETLB_CGROUP_MIN_ORDER to make sure pages weren't too small to use. However, by now, tail page #2 is used to store hugetlb hwpoison and subpool information as well. In other words, without that tail page hugetlb doesn't work. Acknowledge this fact by getting rid of HUGETLB_CGROUP_MIN_ORDER and checks against it. Instead, just check for the minimum viable page order at hstate creation time. 
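As a worked consequence of the new check (a sketch; the value of __NR_USED_SUBPAGE is an assumption based on the description above):

  /*
   * hugetlb state lives in the head page plus tail pages 1 and 2, so a
   * viable hstate needs at least __NR_USED_SUBPAGE struct pages.
   * Assuming __NR_USED_SUBPAGE == 3, order_base_2(3) == 2, so the
   * smallest viable hugetlb page is order 2, i.e. four base pages.
   */
  BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));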
Link: https://lkml.kernel.org/r/20231004153248.3842997-1-fvdl@google.com
Signed-off-by: Frank van der Linden
Reviewed-by: Mike Kravetz
Cc: Muchun Song
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7ad9d2159da4..e2b1c417b90a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4361,7 +4361,7 @@ void __init hugetlb_add_hstate(unsigned int order)
 		return;
 	}
 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
-	BUG_ON(order == 0);
+	BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
 	h = &hstates[hugetlb_max_hstate++];
 	mutex_init(&h->resize_lock);
 	h->order = order;
-- cgit v1.2.3

From 8cba9576df601c384abd334a503c3f6e1e29eefb Mon Sep 17 00:00:00 2001
From: Nhat Pham
Date: Fri, 6 Oct 2023 11:46:28 -0700
Subject: hugetlb: memcg: account hugetlb-backed memory in memory controller

Currently, hugetlb memory usage is not accounted for in the memory controller, which could lead to memory overprotection for cgroups with hugetlb-backed memory.  This has been observed in our production system.

For instance, here is one of our usecases: suppose there are two 32G containers.  The machine is booted with hugetlb_cma=6G, and each container may or may not use up to 3 gigantic pages, depending on the workload within it.  The rest is anon, cache, slab, etc.  We can set the hugetlb cgroup limit of each cgroup to 3G to enforce hugetlb fairness.  But it is very difficult to configure memory.max to keep overall consumption, including anon, cache, slab, etc. fair.

What we have had to resort to is to constantly poll hugetlb usage and readjust memory.max.  A similar procedure is done for other memory limits (memory.low, for example).  However, this is rather cumbersome and buggy.  Furthermore, when there is a delay in correcting the memory limits (e.g. when hugetlb usage changes between consecutive runs of the userspace agent), the system could be in an over/underprotected state.

This patch rectifies this issue by charging the memcg when the hugetlb folio is utilized, and uncharging when the folio is freed (analogous to the hugetlb controller).  Note that we do not charge when the folio is allocated to the hugetlb pool, because at this point it is not owned by any memcg.

Some caveats to consider:

* This feature is only available on cgroup v2.
* There is no hugetlb pool management involved in the memory controller.  As stated above, hugetlb folios are only charged towards the memory controller when they are used.  Host overcommit management has to consider this when configuring hard limits.
* Failure to charge towards the memcg results in SIGBUS.  This could happen even if the hugetlb pool still has pages (but the cgroup limit is hit and the reclaim attempt fails).
* When this feature is enabled, hugetlb pages contribute to memory reclaim protection.  low/min limits tuning must take hugetlb memory into account.
* Hugetlb pages utilized while this option is not selected will not be tracked by the memory controller (even if cgroup v2 is remounted later on).
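In code terms, the lifecycle pairs roughly as below (a simplified sketch of the alloc/free paths changed in this patch; locking and the separate hugetlb-cgroup charging are elided, and the opt-in gating is assumed to be the cgroup v2 mount option added alongside this patch):

  /* alloc_hugetlb_folio(): charge at use time, on the fault path */
  memcg = get_mem_cgroup_from_current();
  ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);  /* may fail -> SIGBUS */
  /* ... obtain the folio from the hugetlb pool ... */
  mem_cgroup_commit_charge(folio, memcg);   /* folio now owned by the memcg */
  mem_cgroup_put(memcg);

  /* free_huge_folio(): uncharge when the folio returns to the pool */
  mem_cgroup_uncharge(folio);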
Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton --- mm/hugetlb.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e2b1c417b90a..da6f85b7db88 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1927,6 +1927,7 @@ void free_huge_folio(struct folio *folio) pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; @@ -3026,11 +3027,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit; + long map_chg, map_commit, nr_pages = pages_per_huge_page(h); long gbl_chg; - int ret, idx; + int memcg_charge_ret, ret, idx; struct hugetlb_cgroup *h_cg = NULL; + struct mem_cgroup *memcg; bool deferred_reserve; + gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; + + memcg = get_mem_cgroup_from_current(); + memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); + if (memcg_charge_ret == -ENOMEM) { + mem_cgroup_put(memcg); + return ERR_PTR(-ENOMEM); + } idx = hstate_index(h); /* @@ -3039,8 +3049,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * code of zero indicates a reservation exists (no change). */ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) + if (map_chg < 0) { + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOMEM); + } /* * Processes that did not create the mapping will have no @@ -3051,10 +3065,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg || avoid_reserve) { gbl_chg = hugepage_subpool_get_pages(spool, 1); - if (gbl_chg < 0) { - vma_end_reservation(h, vma, addr); - return ERR_PTR(-ENOSPC); - } + if (gbl_chg < 0) + goto out_end_reservation; /* * Even though there was no reservation in the region/reserve @@ -3136,6 +3148,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } + + if (!memcg_charge_ret) + mem_cgroup_commit_charge(folio, memcg); + mem_cgroup_put(memcg); + return folio; out_uncharge_cgroup: @@ -3147,7 +3164,11 @@ out_uncharge_cgroup_reservation: out_subpool_put: if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); +out_end_reservation: vma_end_reservation(h, vma, addr); + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOSPC); } -- cgit v1.2.3 From d2cf88c27f51361dfe2982e510a444411d2fc78a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:03 -0700 Subject: hugetlb: optimize update_and_free_pages_bulk to avoid lock cycles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Batch hugetlb vmemmap modification operations", v8. When hugetlb vmemmap optimization was introduced, the overhead of enabling the option was measured as described in commit 426e5c429d16 [1]. 
The summary states that allocating a hugetlb page should be ~2x slower with optimization and freeing a hugetlb page should be ~2-3x slower. Such overhead was deemed an acceptable trade-off for the memory savings obtained by freeing vmemmap pages.
It was recently reported that the overhead associated with enabling vmemmap optimization could be as high as 190x for hugetlb page allocations. Yes, 190x! Some actual numbers from other environments are:
Bare Metal 8 socket Intel(R) Xeon(R) CPU E7-8895
------------------------------------------------
Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 0
time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m4.119s
time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m4.477s
Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 1
time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m28.973s
time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m36.748s
VM with 252 vcpus on host with 2 socket AMD EPYC 7J13 Milan
-----------------------------------------------------------
Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 0
time echo 524288 > .../hugepages-2048kB/nr_hugepages real 0m2.463s
time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m2.931s
Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 1
time echo 524288 > .../hugepages-2048kB/nr_hugepages real 2m27.609s
time echo 0 > .../hugepages-2048kB/nr_hugepages real 2m29.924s
In the VM environment, the slowdown of enabling hugetlb vmemmap optimization resulted in allocation times being 61x slower.
A quick profile showed that the vast majority of this overhead was due to TLB flushing. Each time we modify the kernel pagetable, we need to flush the TLB. For each hugetlb page that is optimized, there could potentially be two TLB flushes performed: one for the vmemmap pages associated with the hugetlb page, and potentially another if the vmemmap pages are mapped at the PMD level and must be split.
The TLB flushes required for the kernel pagetable result in a broadcast IPI, with each CPU having to flush a range of pages or do a global flush if a threshold is exceeded. So, the flush time increases with the number of CPUs. In addition, in virtual environments the broadcast IPI can't be accelerated by hypervisor hardware and leads to traps that need to wake up/IPI all vCPUs, which is very expensive. Because of this, the slowdown in virtual environments is even worse than on bare metal as the number of vCPUs/CPUs is increased.
The following series attempts to reduce the amount of time spent in TLB flushing. The idea is to batch the vmemmap modification operations for multiple hugetlb pages. Instead of doing one or two TLB flushes for each page, we do two TLB flushes for each batch of pages: one flush after splitting pages mapped at the PMD level, and another after remapping the vmemmap associated with all hugetlb pages.
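Schematically, the batching replaces per-page flushing with per-batch flushing. The sketch below is illustrative only: vmemmap_split_pmd_sketch() and vmemmap_remap_sketch() are assumed names rather than the series' actual helpers, and flush_tlb_all() stands in for the range/global flush heuristics described above.

static void hugetlb_vmemmap_optimize_batch_sketch(struct hstate *h,
						  struct list_head *folio_list)
{
	struct folio *folio;

	/* Pass 1: split any PMD-mapped vmemmap, deferring the TLB flush */
	list_for_each_entry(folio, folio_list, lru)
		vmemmap_split_pmd_sketch(h, folio);	/* assumed helper */
	flush_tlb_all();	/* one flush covers every split */

	/* Pass 2: remap tail struct pages to the shared page, deferring again */
	list_for_each_entry(folio, folio_list, lru)
		vmemmap_remap_sketch(h, folio);		/* assumed helper */
	flush_tlb_all();	/* one flush covers every remap */
}

The flush count thus drops from roughly two per hugetlb page to two per batch, which is why the numbers below recover from minutes back to seconds.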
Results of such batching are as follows:
Bare Metal 8 socket Intel(R) Xeon(R) CPU E7-8895
------------------------------------------------
next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 0
time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m4.719s
time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m4.245s
next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 1
time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m7.267s
time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m13.199s
VM with 252 vcpus on host with 2 socket AMD EPYC 7J13 Milan
-----------------------------------------------------------
next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 0
time echo 524288 > .../hugepages-2048kB/nr_hugepages real 0m2.715s
time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m3.186s
next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 1
time echo 524288 > .../hugepages-2048kB/nr_hugepages real 0m4.799s
time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m5.273s
With batching, results are back in the 2-3x slowdown range.
This patch (of 8):
update_and_free_pages_bulk is designed to free a list of hugetlb pages back to their associated lower level allocators. This may require allocating vmemmap pages associated with each hugetlb page. The hugetlb page destructor must be changed before pages are freed to lower level allocators. However, the destructor must be changed under the hugetlb lock. This means there is potentially one lock cycle per page.
Minimize the number of lock cycles in update_and_free_pages_bulk by:
1) allocating the necessary vmemmap for all hugetlb pages on the list
2) taking the hugetlb lock and clearing the destructor for all pages on the list
3) freeing all pages on the list back to the low level allocators
Link: https://lkml.kernel.org/r/20231019023113.345257-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20231019023113.345257-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Acked-by: James Houghton Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da6f85b7db88..b839080a2a6b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1862,7 +1862,46 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { struct folio *folio, *t_folio; + bool clear_dtor = false; + /* + * First allocate required vmemmap (if necessary) for all folios on + * list. If vmemmap can not be allocated, we can not free folio to + * lower level allocator, so add back as hugetlb surplus page. + * add_hugetlb_folio() removes the page from THIS list. + * Use clear_dtor to note if vmemmap was successfully allocated for + * ANY page on the list.
+ */ + list_for_each_entry_safe(folio, t_folio, list, lru) { + if (folio_test_hugetlb_vmemmap_optimized(folio)) { + if (hugetlb_vmemmap_restore(h, &folio->page)) { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_folio(h, folio, true); + spin_unlock_irq(&hugetlb_lock); + } else + clear_dtor = true; + } + } + + /* + * If vmemmap allocation was performed on any folio above, take lock + * to clear destructor of all folios on list. This avoids the need to + * lock/unlock for each individual folio. + * The assumption is vmemmap allocation was performed on all or none + * of the folios on the list. This is true except in VERY rare cases. + */ + if (clear_dtor) { + spin_lock_irq(&hugetlb_lock); + list_for_each_entry(folio, list, lru) + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + } + + /* + * Free folios back to low level allocators. vmemmap and destructors + * were taken care of above, so update_and_free_hugetlb_folio will + * not need to take hugetlb lock. + */ list_for_each_entry_safe(folio, t_folio, list, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); -- cgit v1.2.3
From d67e32f26713c35669e343edfdce6fa97a7d6bbc Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:04 -0700 Subject: hugetlb: restructure pool allocations
Allocation of a hugetlb page for the hugetlb pool is done by the routine alloc_pool_huge_page. This routine will allocate contiguous pages from a low level allocator, prep the pages for use as a hugetlb page, and then add the resulting hugetlb page to the pool.
In the 'prep' stage, optional vmemmap optimization is done. For performance reasons we want to perform vmemmap optimization on multiple hugetlb pages at once. To do this, restructure the hugetlb pool allocation code such that vmemmap optimization can be isolated and later batched.
The code to allocate hugetlb pages from bootmem was also modified to allow batching.
No functional changes, only code restructure.
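Condensed, the restructured pool-growth path takes the following shape. This is a sketch of the diff below, not a drop-in implementation; need_more_pages() is a hypothetical placeholder for the real loop conditions:

static void grow_pool_sketch(struct hstate *h, gfp_t gfp_mask,
			     int nid, nodemask_t *nmask)
{
	LIST_HEAD(folio_list);
	struct folio *folio;

	while (need_more_pages(h)) {	/* hypothetical condition */
		/* allocate and minimally initialize; no vmemmap work, no pool insertion */
		folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
						       nmask, NULL);
		if (!folio)
			break;
		list_add(&folio->lru, &folio_list);
	}

	/* one batched step: account and enqueue everything in a single lock cycle
	 * (a later patch in the series adds batched vmemmap optimization here) */
	prep_and_add_allocated_folios(h, &folio_list);
}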
Link: https://lkml.kernel.org/r/20231019023113.345257-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Tested-by: Sergey Senozhatsky Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 141 insertions(+), 39 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b839080a2a6b..559f7c71c596 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1996,16 +1996,21 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) +static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) { folio_set_hugetlb(folio); - hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); } +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) +{ + init_new_hugetlb_folio(h, folio); + hugetlb_vmemmap_optimize(h, &folio->page); +} + static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { __prep_new_hugetlb_folio(h, folio); @@ -2202,16 +2207,9 @@ retry: return page_folio(page); } -/* - * Common helper to allocate a fresh hugetlb page. All specific allocators - * should use this function to get new hugetlb pages - * - * Note that returned page is 'frozen': ref count of head page and all tail - * pages is zero. - */ -static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) +static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct folio *folio; bool retry = false; @@ -2224,6 +2222,7 @@ retry: nid, nmask, node_alloc_noretry); if (!folio) return NULL; + if (hstate_is_gigantic(h)) { if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { /* @@ -2238,32 +2237,81 @@ retry: return NULL; } } - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); return folio; } +static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) +{ + struct folio *folio; + + folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, + node_alloc_noretry); + if (folio) + init_new_hugetlb_folio(h, folio); + return folio; +} + /* - * Allocates a fresh page to the hugetlb allocator pool in the node interleaved - * manner. + * Common helper to allocate a fresh hugetlb page. All specific allocators + * should use this function to get new hugetlb pages + * + * Note that returned page is 'frozen': ref count of head page and all tail + * pages is zero. 
*/ -static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, - nodemask_t *node_alloc_noretry) +static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct folio *folio; - int nr_nodes, node; + + folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, + node_alloc_noretry); + if (!folio) + return NULL; + + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + return folio; +} + +static void prep_and_add_allocated_folios(struct hstate *h, + struct list_head *folio_list) +{ + unsigned long flags; + struct folio *folio, *tmp_f; + + /* Add all new pool pages to free lists in one lock cycle */ + spin_lock_irqsave(&hugetlb_lock, flags); + list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { + __prep_account_new_huge_page(h, folio_nid(folio)); + enqueue_hugetlb_folio(h, folio); + } + spin_unlock_irqrestore(&hugetlb_lock, flags); +} + +/* + * Allocates a fresh hugetlb page in a node interleaved manner. The page + * will later be added to the appropriate hugetlb pool. + */ +static struct folio *alloc_pool_huge_folio(struct hstate *h, + nodemask_t *nodes_allowed, + nodemask_t *node_alloc_noretry) +{ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nr_nodes, node; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, + struct folio *folio; + + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); - if (folio) { - free_huge_folio(folio); /* free it into the hugepage allocator */ - return 1; - } + if (folio) + return folio; } - return 0; + return NULL; } /* @@ -3302,25 +3350,35 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, */ static void __init gather_bootmem_prealloc(void) { + LIST_HEAD(folio_list); struct huge_bootmem_page *m; + struct hstate *h = NULL, *prev_h = NULL; list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; - struct hstate *h = m->hstate; + + h = m->hstate; + /* + * It is possible to have multiple huge page sizes (hstates) + * in this list. If so, process each size separately. + */ + if (h != prev_h && prev_h != NULL) + prep_and_add_allocated_folios(prev_h, &folio_list); + prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + __prep_new_hugetlb_folio(h, folio); /* If HVO fails, initialize all tail struct pages */ if (!HPageVmemmapOptimized(&folio->page)) hugetlb_folio_init_tail_vmemmap(folio, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); - free_huge_folio(folio); /* add to the hugepage allocator */ + list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages @@ -3330,6 +3388,8 @@ static void __init gather_bootmem_prealloc(void) adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } + + prep_and_add_allocated_folios(h, &folio_list); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) @@ -3363,9 +3423,22 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) h->max_huge_pages_node[nid] = i; } +/* + * NOTE: this routine is called in different contexts for gigantic and + * non-gigantic pages. 
+ * - For gigantic pages, this is called early in the boot process and + * pages are allocated from memblock allocated or something similar. + * Gigantic pages are actually added to pools later with the routine + * gather_bootmem_prealloc. + * - For non-gigantic pages, this is called later in the boot process after + * all of mm is up and functional. Pages are allocated from buddy and + * then added to hugetlb pools. + */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; + struct folio *folio; + LIST_HEAD(folio_list); nodemask_t *node_alloc_noretry; bool node_specific_alloc = false; @@ -3407,14 +3480,25 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { + /* + * gigantic pages not added to list as they are not + * added to pools now. + */ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; - } else if (!alloc_pool_huge_page(h, - &node_states[N_MEMORY], - node_alloc_noretry)) - break; + } else { + folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + node_alloc_noretry); + if (!folio) + break; + list_add(&folio->lru, &folio_list); + } cond_resched(); } + + /* list will be empty if hstate_is_gigantic */ + prep_and_add_allocated_folios(h, &folio_list); + if (i < h->max_huge_pages) { char buf[32]; @@ -3548,7 +3632,9 @@ found: static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { - unsigned long min_count, ret; + unsigned long min_count; + unsigned long allocated; + struct folio *folio; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); @@ -3625,7 +3711,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, break; } - while (count > persistent_huge_pages(h)) { + allocated = 0; + while (count > (persistent_huge_pages(h) + allocated)) { /* * If this allocation races such that we no longer need the * page, free_huge_folio will handle it by freeing the page @@ -3636,15 +3723,32 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, /* yield cpu to avoid soft lockup */ cond_resched(); - ret = alloc_pool_huge_page(h, nodes_allowed, + folio = alloc_pool_huge_folio(h, nodes_allowed, node_alloc_noretry); - spin_lock_irq(&hugetlb_lock); - if (!ret) + if (!folio) { + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); goto out; + } + + list_add(&folio->lru, &page_list); + allocated++; /* Bail for signals. 
Probably ctrl-c from user */ - if (signal_pending(current)) + if (signal_pending(current)) { + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); goto out; + } + + spin_lock_irq(&hugetlb_lock); + } + + /* Add allocated pages to the pool */ + if (!list_empty(&page_list)) { + spin_unlock_irq(&hugetlb_lock); + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); } /* @@ -3670,8 +3774,6 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { - struct folio *folio; - folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); if (!folio) break; -- cgit v1.2.3
From 79359d6d24df2f304abbf6f137b01d2b76175ce3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:05 -0700 Subject: hugetlb: perform vmemmap optimization on a list of pages
When adding hugetlb pages to the pool, we first create a list of the allocated pages before adding to the pool. Pass this list of pages to a new routine hugetlb_vmemmap_optimize_folios() for vmemmap optimization.
Due to significant differences in vmemmap initialization for bootmem allocated hugetlb pages, a new routine prep_and_add_bootmem_folios is created.
We also modify the routine vmemmap_should_optimize() to check for pages that are already optimized. There are code paths that might request vmemmap optimization twice and we want to make sure this is not attempted.
Link: https://lkml.kernel.org/r/20231019023113.345257-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 43 +++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 559f7c71c596..8b171f866d0a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2282,6 +2282,9 @@ static void prep_and_add_allocated_folios(struct hstate *h, unsigned long flags; struct folio *folio, *tmp_f; + /* Send list for bulk vmemmap optimization processing */ + hugetlb_vmemmap_optimize_folios(h, folio_list); + /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { @@ -3344,6 +3347,35 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, prep_compound_head((struct page *)folio, huge_page_order(h)); } +static void __init prep_and_add_bootmem_folios(struct hstate *h, + struct list_head *folio_list) +{ + unsigned long flags; + struct folio *folio, *tmp_f; + + /* Send list for bulk vmemmap optimization processing */ + hugetlb_vmemmap_optimize_folios(h, folio_list); + + /* Add all new pool pages to free lists in one lock cycle */ + spin_lock_irqsave(&hugetlb_lock, flags); + list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { + if (!folio_test_hugetlb_vmemmap_optimized(folio)) { + /* + * If HVO fails, initialize all tail struct pages + * We do not worry about potential long lock hold + * time as this is early in boot and there should + * be no contention.
+ */ + hugetlb_folio_init_tail_vmemmap(folio, + HUGETLB_VMEMMAP_RESERVE_PAGES, + pages_per_huge_page(h)); + } + __prep_account_new_huge_page(h, folio_nid(folio)); + enqueue_hugetlb_folio(h, folio); + } + spin_unlock_irqrestore(&hugetlb_lock, flags); +} + /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_ORDER) pages. @@ -3364,7 +3396,7 @@ static void __init gather_bootmem_prealloc(void) * in this list. If so, process each size separately. */ if (h != prev_h && prev_h != NULL) - prep_and_add_allocated_folios(prev_h, &folio_list); + prep_and_add_bootmem_folios(prev_h, &folio_list); prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); @@ -3372,12 +3404,7 @@ static void __init gather_bootmem_prealloc(void) hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); - __prep_new_hugetlb_folio(h, folio); - /* If HVO fails, initialize all tail struct pages */ - if (!HPageVmemmapOptimized(&folio->page)) - hugetlb_folio_init_tail_vmemmap(folio, - HUGETLB_VMEMMAP_RESERVE_PAGES, - pages_per_huge_page(h)); + init_new_hugetlb_folio(h, folio); list_add(&folio->lru, &folio_list); /* @@ -3389,7 +3416,7 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } - prep_and_add_allocated_folios(h, &folio_list); + prep_and_add_bootmem_folios(h, &folio_list); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) -- cgit v1.2.3 From cfb8c75099dbf152db1b075eead5029c5a30ce51 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:06 -0700 Subject: hugetlb: perform vmemmap restoration on a list of pages The routine update_and_free_pages_bulk already performs vmemmap restoration on the list of hugetlb pages in a separate step. In preparation for more functionality to be added in this step, create a new routine hugetlb_vmemmap_restore_folios() that will restore vmemmap for a list of folios. This new routine must provide sufficient feedback about errors and actual restoration performed so that update_and_free_pages_bulk can perform optimally. Special care must be taken when encountering an error from hugetlb_vmemmap_restore_folios. We want to continue making as much forward progress as possible. A new routine bulk_vmemmap_restore_error handles this specific situation. 
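Condensed, the feedback contract and the caller's retry loop look like this (a sketch distilled from the diff below, not additional code):

static void bulk_restore_sketch(struct hstate *h, struct list_head *folio_list)
{
	LIST_HEAD(non_hvo_folios);
	long ret;

retry:
	/*
	 * Moves each successfully restored folio from folio_list onto
	 * non_hvo_folios; returns < 0 if any restore attempt failed.
	 */
	ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios);
	if (ret < 0) {
		/* free what can be freed to release memory, then retry the rest */
		bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios);
		goto retry;
	}

	/* folio_list is now empty; only non_hvo_folios remains to be freed */
}

Each retry either frees at least one hugetlb page, releasing memory for the next vmemmap allocation attempt, or converts unrestorable folios to surplus pages and drops them from the list, so forward progress is guaranteed.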
Link: https://lkml.kernel.org/r/20231019023113.345257-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 99 +++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 28 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8b171f866d0a..cf834bb7f820 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1859,50 +1859,93 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, schedule_work(&free_hpage_work); } -static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +static void bulk_vmemmap_restore_error(struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; - bool clear_dtor = false; - /* - * First allocate required vmemmap (if necessary) for all folios on - * list. If vmemmap can not be allocated, we can not free folio to - * lower level allocator, so add back as hugetlb surplus page. - * add_hugetlb_folio() removes the page from THIS list. - * Use clear_dtor to note if vmemmap was successfully allocated for - * ANY page on the list. - */ - list_for_each_entry_safe(folio, t_folio, list, lru) { - if (folio_test_hugetlb_vmemmap_optimized(folio)) { + if (!list_empty(non_hvo_folios)) { + /* + * Free any restored hugetlb pages so that restore of the + * entire list can be retried. + * The idea is that in the common case of ENOMEM errors freeing + * hugetlb pages with vmemmap we will free up memory so that we + * can allocate vmemmap for more hugetlb pages. + */ + list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { + list_del(&folio->lru); + spin_lock_irq(&hugetlb_lock); + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + update_and_free_hugetlb_folio(h, folio, false); + cond_resched(); + } + } else { + /* + * In the case where there are no folios which can be + * immediately freed, we loop through the list trying to restore + * vmemmap individually in the hope that someone elsewhere may + * have done something to cause success (such as freeing some + * memory). If unable to restore a hugetlb page, the hugetlb + * page is made a surplus page and removed from the list. + * If we are able to restore vmemmap and free one hugetlb page, we + * quit processing the list to retry the bulk operation. + */ + list_for_each_entry_safe(folio, t_folio, folio_list, lru) if (hugetlb_vmemmap_restore(h, &folio->page)) { + list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); - } else - clear_dtor = true; - } + } else { + list_del(&folio->lru); + spin_lock_irq(&hugetlb_lock); + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + update_and_free_hugetlb_folio(h, folio, false); + cond_resched(); + break; + } } +} + +static void update_and_free_pages_bulk(struct hstate *h, + struct list_head *folio_list) +{ + long ret; + struct folio *folio, *t_folio; + LIST_HEAD(non_hvo_folios); /* - * If vmemmap allocation was performed on any folio above, take lock - * to clear destructor of all folios on list.
This avoids the need to - * lock/unlock for each individual folio. - * The assumption is vmemmap allocation was performed on all or none - * of the folios on the list. This is true except in VERY rare cases. + * First allocate required vmemmap (if necessary) for all folios. + * Carefully handle errors and free up any available hugetlb pages + * in an effort to make forward progress. */ - if (clear_dtor) { +retry: + ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios); + if (ret < 0) { + bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios); + goto retry; + } + + /* + * At this point, list should be empty, ret should be >= 0 and there + * should only be pages on the non_hvo_folios list. + * Do note that the non_hvo_folios list could be empty. + * Without HVO enabled, ret will be 0 and there is no need to call + * __clear_hugetlb_destructor as this was done previously. + */ + VM_WARN_ON(!list_empty(folio_list)); + VM_WARN_ON(ret < 0); + if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); - list_for_each_entry(folio, list, lru) + list_for_each_entry(folio, &non_hvo_folios, lru) __clear_hugetlb_destructor(h, folio); spin_unlock_irq(&hugetlb_lock); } - /* - * Free folios back to low level allocators. vmemmap and destructors - * were taken care of above, so update_and_free_hugetlb_folio will - * not need to take hugetlb lock. - */ - list_for_each_entry_safe(folio, t_folio, list, lru) { + list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } -- cgit v1.2.3
From c5ad3233ead566d74918bd76ba2dbdbe83ba1d9d Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 11 Oct 2023 15:45:57 +0100 Subject: hugetlb_vmemmap: use folio argument for hugetlb_vmemmap_* functions
Most function calls in hugetlb.c are made with folio arguments. This brings the hugetlb_vmemmap calls in line with them by using folio instead of head struct page. Head struct page is still needed within these functions. The set/clear/test functions for hugepages are also changed to folio versions.
Link: https://lkml.kernel.org/r/20231011144557.1720481-2-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Cc: Fam Zheng Cc: Mike Kravetz Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cf834bb7f820..dd8065e36038 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1747,10 +1747,10 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, /* * If folio is not vmemmap optimized (!clear_dtor), then the folio - * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore + * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ - if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) { + if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1893,7 +1893,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h, * quit processing the list to retry the bulk operation.
*/ list_for_each_entry_safe(folio, t_folio, folio_list, lru) - if (hugetlb_vmemmap_restore(h, &folio->page)) { + if (hugetlb_vmemmap_restore_folio(h, folio)) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); @@ -2051,7 +2051,7 @@ static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { init_new_hugetlb_folio(h, folio); - hugetlb_vmemmap_optimize(h, &folio->page); + hugetlb_vmemmap_optimize_folio(h, folio); } static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) @@ -2462,7 +2462,7 @@ retry: * non-vmemmap optimized hugetlb folios. */ if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore(h, &folio->page); + rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, false); @@ -3886,11 +3886,11 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) /* * If vmemmap already existed for folio, the remove routine above would * have cleared the hugetlb folio flag. Hence the folio is technically - * no longer a hugetlb folio. hugetlb_vmemmap_restore can only be + * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be * passed hugetlb folios and will BUG otherwise. */ if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore(h, &folio->page); + rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); -- cgit v1.2.3 From 72e315f7a750281b4410ac30d8930f735459e72d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:27:47 -0700 Subject: mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. 
It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/hugetlb.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd8065e36038..1169ef2f2176 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2630,24 +2630,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } -/* mempolicy aware migration callback */ -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address) -{ - struct mempolicy *mpol; - nodemask_t *nodemask; - struct folio *folio; - gfp_t gfp_mask; - int node; - - gfp_mask = htlb_alloc_mask(h); - node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); - mpol_cond_put(mpol); - - return folio; -} - /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. @@ -6559,6 +6541,26 @@ out_mutex: } #ifdef CONFIG_USERFAULTFD +/* + * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). + */ +static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct folio *folio; + gfp_t gfp_mask; + int node; + + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + mpol_cond_put(mpol); + + return folio; +} + /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. -- cgit v1.2.3