From 5e9113731a3ce616e8b5aa128ffc1aeaa4942571 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:28 -0700 Subject: mm/hugetlb: add cache of descriptors to resv_map for region_add hugetlbfs is used today by applications that want a high degree of control over huge page usage. Often, large hugetlbfs files are used to map a large number of huge pages into the application processes. The applications know when page ranges within these large files will no longer be used, and ideally would like to release them back to the subpool or global pools for other uses. The fallocate() system call provides an interface for preallocation and hole punching within files. This patch set adds fallocate functionality to hugetlbfs. fallocate hole punch will want to remove a specific range of pages. When pages are removed, their associated entries in the region/reserve map will also be removed. This will break an assumption in the region_chg/region_add calling sequence. If a new region descriptor must be allocated, it is done as part of the region_chg processing. In this way, region_add can not fail because it does not need to attempt an allocation. To prepare for fallocate hole punch, create a "cache" of descriptors that can be used by region_add if necessary. region_chg will ensure there are sufficient entries in the cache. It will be necessary to track the number of in progress add operations to know a sufficient number of descriptors reside in the cache. A new routine region_abort is added to adjust this in progress count when add operations are aborted. vma_abort_reservation is also added for callers creating reservations with vma_needs_reservation/vma_commit_reservation. 
[akpm@linux-foundation.org: fix typo in comment, use more cols] Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 152 insertions(+), 22 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51ae41d0fbc0..4e5815ed7a8e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -240,11 +240,14 @@ struct file_region { /* * Add the huge page range represented by [f, t) to the reserve - * map. Existing regions will be expanded to accommodate the - * specified range. We know only existing regions need to be - * expanded, because region_add is only called after region_chg - * with the same range. If a new file_region structure must - * be allocated, it is done in region_chg. + * map. In the normal case, existing regions will be expanded + * to accommodate the specified range. Sufficient regions should + * exist for expansion due to the previous call to region_chg + * with the same range. However, it is possible that region_del + * could have been called after region_chg and modifed the map + * in such a way that no region exists to be expanded. In this + * case, pull a region descriptor from the cache associated with + * the map and use that for the new range. * * Return the number of new huge pages added to the map. This * number is greater than or equal to zero. @@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t) if (f <= rg->to) break; + /* + * If no region exists which can be expanded to include the + * specified range, the list must have been modified by an + * interleving call to region_del(). Pull a region descriptor + * from the cache and use it for this range. 
+ */ + if (&rg->link == head || t < rg->from) { + VM_BUG_ON(resv->region_cache_count <= 0); + + resv->region_cache_count--; + nrg = list_first_entry(&resv->region_cache, struct file_region, + link); + list_del(&nrg->link); + + nrg->from = f; + nrg->to = t; + list_add(&nrg->link, rg->link.prev); + + add += t - f; + goto out_locked; + } + /* Round our left edge to the current segment if it encloses us. */ if (f > rg->from) f = rg->from; @@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t) add += t - nrg->to; /* Added to end of region */ nrg->to = t; +out_locked: + resv->adds_in_progress--; spin_unlock(&resv->lock); VM_BUG_ON(add < 0); return add; @@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t) * so that the subsequent region_add call will have all the * regions it needs and will not fail. * - * Returns the number of huge pages that need to be added - * to the existing reservation map for the range [f, t). - * This number is greater or equal to zero. -ENOMEM is - * returned if a new file_region structure is needed and can - * not be allocated. + * Upon entry, region_chg will also examine the cache of region descriptors + * associated with the map. If there are not enough descriptors cached, one + * will be allocated for the in progress add operation. + * + * Returns the number of huge pages that need to be added to the existing + * reservation map for the range [f, t). This number is greater or equal to + * zero. -ENOMEM is returned if a new file_region structure or cache entry + * is needed and can not be allocated. */ static long region_chg(struct resv_map *resv, long f, long t) { @@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t) retry: spin_lock(&resv->lock); +retry_locked: + resv->adds_in_progress++; + + /* + * Check for sufficient descriptors in the cache to accommodate + * the number of in progress add operations. 
+ */ + if (resv->adds_in_progress > resv->region_cache_count) { + struct file_region *trg; + + VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1); + /* Must drop lock to allocate a new descriptor. */ + resv->adds_in_progress--; + spin_unlock(&resv->lock); + + trg = kmalloc(sizeof(*trg), GFP_KERNEL); + if (!trg) + return -ENOMEM; + + spin_lock(&resv->lock); + list_add(&trg->link, &resv->region_cache); + resv->region_cache_count++; + goto retry_locked; + } + /* Locate the region we are before or in. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -336,6 +391,7 @@ retry: * size such that we can guarantee to record the reservation. */ if (&rg->link == head || t < rg->from) { if (!nrg) { + resv->adds_in_progress--; spin_unlock(&resv->lock); nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); if (!nrg) @@ -384,6 +440,25 @@ out_nrg: return chg; } +/* + * Abort the in progress add operation. The adds_in_progress field + * of the resv_map keeps track of the operations in progress between + * calls to region_chg and region_add. Operations are sometimes + * aborted after the call to region_chg. In such cases, region_abort + * is called to decrement the adds_in_progress counter. + * + * NOTE: The range arguments [f, t) are not needed or used in this + * routine. They are kept to make reading the calling code easier as + * arguments will match the associated region_chg call. + */ +static void region_abort(struct resv_map *resv, long f, long t) +{ + spin_lock(&resv->lock); + VM_BUG_ON(!resv->region_cache_count); + resv->adds_in_progress--; + spin_unlock(&resv->lock); +} + /* * Truncate the reserve map at index 'end'. Modify/truncate any * region which contains end. Delete any regions past end. 
@@ -544,22 +619,44 @@ static void set_vma_private_data(struct vm_area_struct *vma, struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); - if (!resv_map) + struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); + + if (!resv_map || !rg) { + kfree(resv_map); + kfree(rg); return NULL; + } kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); + resv_map->adds_in_progress = 0; + + INIT_LIST_HEAD(&resv_map->region_cache); + list_add(&rg->link, &resv_map->region_cache); + resv_map->region_cache_count = 1; + return resv_map; } void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + struct list_head *head = &resv_map->region_cache; + struct file_region *rg, *trg; /* Clear out any active regions before we release the map. */ region_truncate(resv_map, 0); + + /* ... and any entries left in the cache */ + list_for_each_entry_safe(rg, trg, head, link) { + list_del(&rg->link); + kfree(rg); + } + + VM_BUG_ON(resv_map->adds_in_progress); + kfree(resv_map); } @@ -1473,16 +1570,18 @@ static void return_unused_surplus_pages(struct hstate *h, } } + /* - * vma_needs_reservation and vma_commit_reservation are used by the huge - * page allocation routines to manage reservations. + * vma_needs_reservation, vma_commit_reservation and vma_abort_reservation + * are used by the huge page allocation routines to manage reservations. * * vma_needs_reservation is called to determine if the huge page at addr * within the vma has an associated reservation. If a reservation is * needed, the value 1 is returned. The caller is then responsible for * managing the global reservation and subpool usage counts. After * the huge page has been allocated, vma_commit_reservation is called - * to add the page to the reservation map. + * to add the page to the reservation map. 
If the reservation must be + * aborted instead of committed, vma_abort_reservation is called. * * In the normal case, vma_commit_reservation returns the same value * as the preceding vma_needs_reservation call. The only time this @@ -1490,9 +1589,14 @@ static void return_unused_surplus_pages(struct hstate *h, * is the responsibility of the caller to notice the difference and * take appropriate action. */ +enum vma_resv_mode { + VMA_NEEDS_RESV, + VMA_COMMIT_RESV, + VMA_ABORT_RESV, +}; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, - bool commit) + enum vma_resv_mode mode) { struct resv_map *resv; pgoff_t idx; @@ -1503,10 +1607,20 @@ static long __vma_reservation_common(struct hstate *h, return 1; idx = vma_hugecache_offset(h, vma, addr); - if (commit) - ret = region_add(resv, idx, idx + 1); - else + switch (mode) { + case VMA_NEEDS_RESV: ret = region_chg(resv, idx, idx + 1); + break; + case VMA_COMMIT_RESV: + ret = region_add(resv, idx, idx + 1); + break; + case VMA_ABORT_RESV: + region_abort(resv, idx, idx + 1); + ret = 0; + break; + default: + BUG(); + } if (vma->vm_flags & VM_MAYSHARE) return ret; @@ -1517,13 +1631,19 @@ static long __vma_reservation_common(struct hstate *h, static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - return __vma_reservation_common(h, vma, addr, false); + return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); } static long vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - return __vma_reservation_common(h, vma, addr, true); + return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); +} + +static void vma_abort_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + (void)__vma_reservation_common(h, vma, addr, VMA_ABORT_RESV); } static struct page *alloc_huge_page(struct vm_area_struct *vma, @@ -1549,8 +1669,10 @@ static struct page 
*alloc_huge_page(struct vm_area_struct *vma, if (chg < 0) return ERR_PTR(-ENOMEM); if (chg || avoid_reserve) - if (hugepage_subpool_get_pages(spool, 1) < 0) + if (hugepage_subpool_get_pages(spool, 1) < 0) { + vma_abort_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); + } ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) @@ -1596,6 +1718,7 @@ out_uncharge_cgroup: out_subpool_put: if (chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); + vma_abort_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -3236,11 +3359,14 @@ retry: * any allocations necessary to record that reservation occur outside * the spinlock. */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } + /* Just decrements count, does not deallocate */ + vma_abort_reservation(h, vma, address); + } ptl = huge_pte_lockptr(h, mm, ptep); spin_lock(ptl); @@ -3387,6 +3513,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out_mutex; } + /* Just decrements count, does not deallocate */ + vma_abort_reservation(h, vma, address); if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, @@ -3726,6 +3854,8 @@ int hugetlb_reserve_pages(struct inode *inode, } return 0; out_err: + if (!vma || vma->vm_flags & VM_MAYSHARE) + region_abort(resv_map, from, to); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_put(&resv_map->refs, resv_map_release); return ret; -- cgit v1.2.3 From feba16e25a578080af5aad5eb9e469b4e6c23eef Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:31 -0700 Subject: mm/hugetlb: add region_del() to delete a specific range of entries fallocate hole punch will want to remove a specific range of pages. 
The existing region_truncate() routine deletes all region/reserve map entries after a specified offset. region_del() will provide this same functionality if the end of region is specified as LONG_MAX. Hence, region_del() can replace region_truncate(). Unlike region_truncate(), region_del() can return an error in the rare case where it can not allocate memory for a region descriptor. This ONLY happens in the case where an existing region must be split. Current callers passing LONG_MAX as end of range will never experience this error and do not need to deal with error handling. Future callers of region_del() (such as fallocate hole punch) will need to handle this error. Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 122 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 37 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4e5815ed7a8e..78e7eded4063 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -460,43 +460,90 @@ static void region_abort(struct resv_map *resv, long f, long t) } /* - * Truncate the reserve map at index 'end'. Modify/truncate any - * region which contains end. Delete any regions past end. - * Return the number of huge pages removed from the map. + * Delete the specified range [f, t) from the reserve map. If the + * t parameter is LONG_MAX, this indicates that ALL regions after f + * should be deleted. Locate the regions which intersect [f, t) + * and either trim, delete or split the existing regions. + * + * Returns the number of huge pages deleted from the reserve map. + * In the normal case, the return value is zero or more. In the + * case where a region must be split, a new region descriptor must + * be allocated. 
If the allocation fails, -ENOMEM will be returned. + * NOTE: If the parameter t == LONG_MAX, then we will never split + * a region and possibly return -ENOMEM. Callers specifying + * t == LONG_MAX do not need to check for -ENOMEM error. */ -static long region_truncate(struct resv_map *resv, long end) +static long region_del(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *trg; - long chg = 0; + struct file_region *nrg = NULL; + long del = 0; +retry: spin_lock(&resv->lock); - /* Locate the region we are either in or before. */ - list_for_each_entry(rg, head, link) - if (end <= rg->to) + list_for_each_entry_safe(rg, trg, head, link) { + if (rg->to <= f) + continue; + if (rg->from >= t) break; - if (&rg->link == head) - goto out; - /* If we are in the middle of a region then adjust it. */ - if (end > rg->from) { - chg = rg->to - end; - rg->to = end; - rg = list_entry(rg->link.next, typeof(*rg), link); - } + if (f > rg->from && t < rg->to) { /* Must split region */ + /* + * Check for an entry in the cache before dropping + * lock and attempting allocation. + */ + if (!nrg && + resv->region_cache_count > resv->adds_in_progress) { + nrg = list_first_entry(&resv->region_cache, + struct file_region, + link); + list_del(&nrg->link); + resv->region_cache_count--; + } - /* Drop any remaining regions. 
*/ - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) + if (!nrg) { + spin_unlock(&resv->lock); + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + goto retry; + } + + del += t - f; + + /* New entry for end of split region */ + nrg->from = t; + nrg->to = rg->to; + INIT_LIST_HEAD(&nrg->link); + + /* Original entry is trimmed */ + rg->to = f; + + list_add(&nrg->link, &rg->link); + nrg = NULL; break; - chg += rg->to - rg->from; - list_del(&rg->link); - kfree(rg); + } + + if (f <= rg->from && t >= rg->to) { /* Remove entire region */ + del += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + continue; + } + + if (f <= rg->from) { /* Trim beginning of region */ + del += t - rg->from; + rg->from = t; + } else { /* Trim end of region */ + del += rg->to - f; + rg->to = f; + } } -out: spin_unlock(&resv->lock); - return chg; + kfree(nrg); + return del; } /* @@ -647,7 +694,7 @@ void resv_map_release(struct kref *ref) struct file_region *rg, *trg; /* Clear out any active regions before we release the map. */ - region_truncate(resv_map, 0); + region_del(resv_map, 0, LONG_MAX); /* ... and any entries left in the cache */ list_for_each_entry_safe(rg, trg, head, link) { @@ -1572,7 +1619,7 @@ static void return_unused_surplus_pages(struct hstate *h, /* - * vma_needs_reservation, vma_commit_reservation and vma_abort_reservation + * vma_needs_reservation, vma_commit_reservation and vma_end_reservation * are used by the huge page allocation routines to manage reservations. * * vma_needs_reservation is called to determine if the huge page at addr @@ -1580,8 +1627,9 @@ static void return_unused_surplus_pages(struct hstate *h, * needed, the value 1 is returned. The caller is then responsible for * managing the global reservation and subpool usage counts. After * the huge page has been allocated, vma_commit_reservation is called - * to add the page to the reservation map. 
If the reservation must be - * aborted instead of committed, vma_abort_reservation is called. + * to add the page to the reservation map. If the page allocation fails, + * the reservation must be ended instead of committed. vma_end_reservation + * is called in such cases. * * In the normal case, vma_commit_reservation returns the same value * as the preceding vma_needs_reservation call. The only time this @@ -1592,7 +1640,7 @@ static void return_unused_surplus_pages(struct hstate *h, enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, - VMA_ABORT_RESV, + VMA_END_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, @@ -1614,7 +1662,7 @@ static long __vma_reservation_common(struct hstate *h, case VMA_COMMIT_RESV: ret = region_add(resv, idx, idx + 1); break; - case VMA_ABORT_RESV: + case VMA_END_RESV: region_abort(resv, idx, idx + 1); ret = 0; break; @@ -1640,10 +1688,10 @@ static long vma_commit_reservation(struct hstate *h, return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); } -static void vma_abort_reservation(struct hstate *h, +static void vma_end_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - (void)__vma_reservation_common(h, vma, addr, VMA_ABORT_RESV); + (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } static struct page *alloc_huge_page(struct vm_area_struct *vma, @@ -1670,7 +1718,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOMEM); if (chg || avoid_reserve) if (hugepage_subpool_get_pages(spool, 1) < 0) { - vma_abort_reservation(h, vma, addr); + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -1718,7 +1766,7 @@ out_uncharge_cgroup: out_subpool_put: if (chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); - vma_abort_reservation(h, vma, addr); + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -3365,7 +3413,7 @@ retry: goto backout_unlocked; } /* Just decrements 
count, does not deallocate */ - vma_abort_reservation(h, vma, address); + vma_end_reservation(h, vma, address); } ptl = huge_pte_lockptr(h, mm, ptep); @@ -3514,7 +3562,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_abort_reservation(h, vma, address); + vma_end_reservation(h, vma, address); if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, @@ -3870,7 +3918,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) long gbl_reserve; if (resv_map) - chg = region_truncate(resv_map, offset); + chg = region_del(resv_map, offset, LONG_MAX); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); -- cgit v1.2.3 From c672c7f29f2fdb73e1f72911bf499675c81fcdbb Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:35 -0700 Subject: mm/hugetlb: expose hugetlb fault mutex for use by fallocate hugetlb page faults are currently synchronized by the table of mutexes (htlb_fault_mutex_table). fallocate code will need to synchronize with the page fault code when it allocates or deletes pages. Expose interfaces so that fallocate operations can be synchronized with page faults. Minor name changes to be more consistent with other global hugetlb symbols. Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 78e7eded4063..070880fe1ff7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock); * prevent spurious OOMs when the hugepage pool is fully utilized. 
*/ static int num_fault_mutexes; -static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; +struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -2482,7 +2482,7 @@ static void __exit hugetlb_exit(void) } kobject_put(hugepages_kobj); - kfree(htlb_fault_mutex_table); + kfree(hugetlb_fault_mutex_table); } module_exit(hugetlb_exit); @@ -2515,12 +2515,12 @@ static int __init hugetlb_init(void) #else num_fault_mutexes = 1; #endif - htlb_fault_mutex_table = + hugetlb_fault_mutex_table = kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); - BUG_ON(!htlb_fault_mutex_table); + BUG_ON(!hugetlb_fault_mutex_table); for (i = 0; i < num_fault_mutexes; i++) - mutex_init(&htlb_fault_mutex_table[i]); + mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } module_init(hugetlb_init); @@ -3454,7 +3454,7 @@ backout_unlocked: } #ifdef CONFIG_SMP -static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address) @@ -3479,7 +3479,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address) @@ -3527,8 +3527,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. 
*/ - hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); - mutex_lock(&htlb_fault_mutex_table[hash]); + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); + mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { @@ -3613,7 +3613,7 @@ out_ptl: put_page(pagecache_page); } out_mutex: - mutex_unlock(&htlb_fault_mutex_table[hash]); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and -- cgit v1.2.3 From b5cec28d36f5ee6b4e6f68a0a40aa1e4045d6d99 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:41 -0700 Subject: hugetlbfs: truncate_hugepages() takes a range of pages Modify truncate_hugepages() to take a range of pages (start, end) instead of simply start. If an end value of LLONG_MAX is passed, the current "truncate" functionality is maintained. Existing callers are modified to pass LLONG_MAX as end of range. By keying off end == LLONG_MAX, the routine behaves differently for truncate and hole punch. Page removal is now synchronized with page allocation via faults by using the fault mutex table. The hole punch case can experience the rare region_del error and must handle accordingly. Add the routine hugetlb_fix_reserve_counts to fix up reserve counts in the case where region_del returns an error. Since the routine handles more than just the truncate case, it is renamed to remove_inode_hugepages(). To be consistent, the routine truncate_huge_page() is renamed remove_huge_page(). Downstream of remove_inode_hugepages(), the routine hugetlb_unreserve_pages() is also modified to take a range of pages. hugetlb_unreserve_pages is modified to detect an error from region_del and pass it back to the caller. 
Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 070880fe1ff7..61c52cd5f77b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -546,6 +546,28 @@ retry: return del; } +/* + * A rare out of memory error was encountered which prevented removal of + * the reserve map region for a page. The huge page itself was free'ed + * and removed from the page cache. This routine will adjust the subpool + * usage count, and the global reserve count if needed. By incrementing + * these counts, the reserve map entry which could not be deleted will + * appear as a "reserved" entry instead of simply dangling with incorrect + * counts. + */ +void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve) +{ + struct hugepage_subpool *spool = subpool_inode(inode); + long rsv_adjust; + + rsv_adjust = hugepage_subpool_get_pages(spool, 1); + if (restore_reserve && rsv_adjust) { + struct hstate *h = hstate_inode(inode); + + hugetlb_acct_memory(h, 1); + } +} + /* * Count and return the number of huge pages in the reserve map * that intersect with the range [f, t). 
@@ -3909,7 +3931,8 @@ out_err: return ret; } -void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) +long hugetlb_unreserve_pages(struct inode *inode, long start, long end, + long freed) { struct hstate *h = hstate_inode(inode); struct resv_map *resv_map = inode_resv_map(inode); @@ -3917,8 +3940,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; - if (resv_map) - chg = region_del(resv_map, offset, LONG_MAX); + if (resv_map) { + chg = region_del(resv_map, start, end); + /* + * region_del() can fail in the rare case where a region + * must be split and another region descriptor can not be + * allocated. If end == LONG_MAX, it will not fail. + */ + if (chg < 0) + return chg; + } + spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); @@ -3929,6 +3961,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); + + return 0; } #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE -- cgit v1.2.3 From 1fb1b0e9ef2d661488f8053986c3b7641cae529d Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:44 -0700 Subject: mm/hugetlb: vma_has_reserves() needs to handle fallocate hole punch In vma_has_reserves(), the current assumption is that reserves are always present for shared mappings. However, this will not be the case with fallocate hole punch. When punching a hole, the present page will be deleted as well as the region/reserve map entry (and hence any reservation). vma_has_reserves is passed "chg" which indicates whether or not a region/reserve map is present. Use this to determine if reserves are actually present or were removed via hole punch. 
Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61c52cd5f77b..bd12e8c8bc7b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -801,8 +801,19 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) } /* Shared mappings always use reserves */ - if (vma->vm_flags & VM_MAYSHARE) - return true; + if (vma->vm_flags & VM_MAYSHARE) { + /* + * We know VM_NORESERVE is not set. Therefore, there SHOULD + * be a region map for all pages. The only situation where + * there is no region map is if a hole was punched via + * fallocate. In this case, there really are no reverves to + * use. This situation is indicated if chg != 0. + */ + if (chg) + return false; + else + return true; + } /* * Only the process that called mmap() has reserves for -- cgit v1.2.3 From d85f69b0b533ec6d7ac8c21db958c44c6d957c90 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:47 -0700 Subject: mm/hugetlb: alloc_huge_page handle areas hole punched by fallocate Areas hole punched by fallocate will not have entries in the region/reserve map. However, shared mappings with min_size subpool reservations may still have reserved pages. alloc_huge_page needs to handle this special case and do the proper accounting. 
Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 54 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bd12e8c8bc7b..114ad6ce7030 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1733,34 +1733,58 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; - long chg, commit; + long map_chg, map_commit; + long gbl_chg; int ret, idx; struct hugetlb_cgroup *h_cg; idx = hstate_index(h); /* - * Processes that did not create the mapping will have no - * reserves and will not have accounted against subpool - * limit. Check that the subpool limit can be made before - * satisfying the allocation MAP_NORESERVE mappings may also - * need pages and subpool limit allocated allocated if no reserve - * mapping overlaps. + * Examine the region/reserve map to determine if the process + * has a reservation for the page to be allocated. A return + * code of zero indicates a reservation exists (no change). */ - chg = vma_needs_reservation(h, vma, addr); - if (chg < 0) + map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); + if (map_chg < 0) return ERR_PTR(-ENOMEM); - if (chg || avoid_reserve) - if (hugepage_subpool_get_pages(spool, 1) < 0) { + + /* + * Processes that did not create the mapping will have no + * reserves as indicated by the region/reserve map. Check + * that the allocation will not exceed the subpool limit. + * Allocations for MAP_NORESERVE mappings also need to be + * checked against any subpool limit. 
+ */ + if (map_chg || avoid_reserve) { + gbl_chg = hugepage_subpool_get_pages(spool, 1); + if (gbl_chg < 0) { vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } + /* + * Even though there was no reservation in the region/reserve + * map, there could be reservations associated with the + * subpool that can be used. This would be indicated if the + * return value of hugepage_subpool_get_pages() is zero. + * However, if avoid_reserve is specified we still avoid even + * the subpool reservations. + */ + if (avoid_reserve) + gbl_chg = 1; + } + ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_subpool_put; spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); + /* + * glb_chg is passed to indicate whether or not a page must be taken + * from the global free pool (global change). gbl_chg == 0 indicates + * a reservation exists for the allocation. + */ + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); @@ -1776,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_private(page, (unsigned long)spool); - commit = vma_commit_reservation(h, vma, addr); - if (unlikely(chg > commit)) { + map_commit = vma_commit_reservation(h, vma, addr); + if (unlikely(map_chg > map_commit)) { /* * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. 
@@ -1797,7 +1821,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (chg || avoid_reserve) + if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); -- cgit v1.2.3 From ab76ad540a50191308e5bb6b5e2d9e26c78616d3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:50 -0700 Subject: hugetlbfs: New huge_add_to_page_cache helper routine Currently, there is only a single place where hugetlbfs pages are added to the page cache. The new fallocate code will be adding a second one, so break the functionality out into its own helper. Signed-off-by: Dave Hansen Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 114ad6ce7030..d45eacc5653e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3375,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return page != NULL; } +int huge_add_to_page_cache(struct page *page, struct address_space *mapping, + pgoff_t idx) +{ + struct inode *inode = mapping->host; + struct hstate *h = hstate_inode(inode); + int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + + if (err) + return err; + ClearPagePrivate(page); + + spin_lock(&inode->i_lock); + inode->i_blocks += blocks_per_huge_page(h); + spin_unlock(&inode->i_lock); + return 0; +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address, pte_t *ptep, unsigned int flags) @@ -3422,21 +3439,13 @@ retry: 
set_page_huge_active(page); if (vma->vm_flags & VM_MAYSHARE) { - int err; - struct inode *inode = mapping->host; - - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + int err = huge_add_to_page_cache(page, mapping, idx); if (err) { put_page(page); if (err == -EEXIST) goto retry; goto out; } - ClearPagePrivate(page); - - spin_lock(&inode->i_lock); - inode->i_blocks += blocks_per_huge_page(h); - spin_unlock(&inode->i_lock); } else { lock_page(page); if (unlikely(anon_vma_prepare(vma))) { -- cgit v1.2.3 From 70c3547e36f5c9fbc4caecfeca98f0effa6932c5 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 8 Sep 2015 15:01:54 -0700 Subject: hugetlbfs: add hugetlbfs_fallocate() This is based on the shmem version, but it has diverged quite a bit. We have no swap to worry about, nor the new file sealing. Add synchronization via the fault mutex table to coordinate page faults, fallocate allocation and fallocate hole punch. What this allows us to do is move physical memory in and out of a hugetlbfs file without having it mapped. This also gives us the ability to support MADV_REMOVE since it is currently implemented using fallocate(). MADV_REMOVE lets madvise() remove pages from the middle of a hugetlbfs file, which wasn't possible before. hugetlbfs fallocate only operates on whole huge pages. Based on code by Dave Hansen. 
Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Dave Hansen Cc: David Rientjes Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Christoph Hellwig Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d45eacc5653e..cd1280c487ff 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1727,7 +1727,7 @@ static void vma_end_reservation(struct hstate *h, (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } -static struct page *alloc_huge_page(struct vm_area_struct *vma, +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); -- cgit v1.2.3 From 96db800f5d73cd5c49461253d45766e094f0f8c2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 8 Sep 2015 15:03:50 -0700 Subject: mm: rename alloc_pages_exact_node() to __alloc_pages_node() alloc_pages_exact_node() was introduced in commit 6484eb3e2a81 ("page allocator: do not check NUMA node ID when the caller knows the node is valid") as an optimized variant of alloc_pages_node(), that doesn't fall back to the current node for nid == NUMA_NO_NODE. Unfortunately the name of the function can easily suggest that the allocation is restricted to the given node and fails otherwise. In truth, the node is only preferred, unless __GFP_THISNODE is passed among the gfp flags. The misleading name has led to mistakes in the past, see for example commits 5265047ac301 ("mm, thp: really limit transparent hugepage allocation to local node") and b360edb43f8e ("mm, mempolicy: migrate_to_node should only migrate to node"). Another issue with the name is that there's a family of alloc_pages_exact*() functions where 'exact' means exact size (instead of page order), which leads to more confusion. 
To prevent further mistakes, this patch effectively renames alloc_pages_exact_node() to __alloc_pages_node() to better convey that it's an optimized variant of alloc_pages_node() not intended for general usage. Both functions get described in comments. It has also been considered to really provide a convenience function for allocations restricted to a node, but the majority opinion seems to be that __GFP_THISNODE already provides that functionality and we shouldn't duplicate the API needlessly. The number of users would be small anyway. Existing callers of alloc_pages_exact_node() are simply converted to call __alloc_pages_node(), with the exception of sba_alloc_coherent() which open-codes the check for NUMA_NO_NODE, so it is converted to use alloc_pages_node() instead. This means it no longer performs some VM_BUG_ON checks, and since the current check for nid in alloc_pages_node() uses a 'nid < 0' comparison (which includes NUMA_NO_NODE), it may hide wrong values which would previously have been exposed. Both differences will be rectified by the next patch. To sum up, this patch makes no functional changes, except temporarily hiding potentially buggy callers. Restricting the checks in alloc_pages_node() is left for the next patch which can in turn expose more existing buggy callers. Signed-off-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Robin Holt Acked-by: Michal Hocko Acked-by: Christoph Lameter Acked-by: Michael Ellerman Cc: Mel Gorman Cc: David Rientjes Cc: Greg Thelen Cc: Aneesh Kumar K.V Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Naoya Horiguchi Cc: Tony Luck Cc: Fenghua Yu Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Cliff Whickman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd1280c487ff..999fb0aef8f1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1331,7 +1331,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; - page = alloc_pages_exact_node(nid, + page = __alloc_pages_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -1483,7 +1483,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); else - page = alloc_pages_exact_node(nid, + page = __alloc_pages_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); -- cgit v1.2.3