From 07ca760673088f262da57ff42c15558688565aa2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:29:54 -0800 Subject: mm/munlock: maintain page->mlock_count while unevictable Previous patches have been preparatory: now implement page->mlock_count. The ordering of the "Unevictable LRU" is of no significance, and there is no point holding unevictable pages on a list: place page->mlock_count to overlay page->lru.prev (since page->lru.next is overlaid by compound_head, which needs to be even so as not to satisfy PageTail - though 2 could be added instead of 1 for each mlock, if that's ever an improvement). But it's only safe to rely on or modify page->mlock_count while lruvec lock is held and page is on unevictable "LRU" - we can save lots of edits by continuing to pretend that there's an imaginary LRU here (there is an unevictable count which still needs to be maintained, but not a list). The mlock_count technique suffers from an unreliability much like with page_mlock(): while someone else has the page off LRU, not much can be done. As before, err on the safe side (behave as if mlock_count 0), and let try_to_unlock_one() move the page to unevictable if reclaim finds out later on - a few misplaced pages don't matter, what we want to avoid is imbalancing reclaim by flooding evictable lists with unevictable pages. I am not a fan of "if (!isolate_lru_page(page)) putback_lru_page(page);": if we have taken lruvec lock to get the page off its present list, then we save everyone trouble (and however many extra atomic ops) by putting it on its destination list immediately. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm_inline.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux/mm_inline.h') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b725839dfe71..884d6f6af05b 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -99,7 +99,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); - list_add(&folio->lru, &lruvec->lists[lru]); + if (lru != LRU_UNEVICTABLE) + list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list(struct page *page, @@ -115,6 +116,7 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); + /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } @@ -127,8 +129,11 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { - list_del(&folio->lru); - update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), + enum lru_list lru = folio_lru_list(folio); + + if (lru != LRU_UNEVICTABLE) + list_del(&folio->lru); + update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } -- cgit v1.2.3 From 5c26f6ac9416b63d093e29c30e79b3297e425472 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Mar 2022 20:28:51 -0800 Subject: mm: refactor vm_area_struct::anon_vma_name usage code Avoid mixing strings and their anon_vma_name referenced pointers by using struct anon_vma_name whenever possible. This simplifies the code and allows easier sharing of anon_vma_name structures when they represent the same name. [surenb@google.com: fix comment] Link: https://lkml.kernel.org/r/20220223153613.835563-1-surenb@google.com Link: https://lkml.kernel.org/r/20220224231834.1481408-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Matthew Wilcox Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Colin Cross Cc: Sumit Semwal Cc: Dave Hansen Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Johannes Weiner Cc: "Eric W. Biederman" Cc: Christian Brauner Cc: Alexey Gladkov Cc: Sasha Levin Cc: Chris Hyser Cc: Davidlohr Bueso Cc: Peter Collingbourne Cc: Xiaofeng Cao Cc: David Hildenbrand Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 87 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 28 deletions(-) (limited to 'include/linux/mm_inline.h') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b725839dfe71..dd3accaa4e6d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -140,50 +140,81 @@ static __always_inline void del_page_from_lru_list(struct page *page, #ifdef CONFIG_ANON_VMA_NAME /* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. + * mmap_lock should be read-locked when calling anon_vma_name(). Caller should + * either keep holding the lock while using the returned pointer or it should + * raise anon_vma_name refcount before releasing the lock. */ -extern const char *vma_anon_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); +extern struct anon_vma_name *anon_vma_name_alloc(const char *name); +extern void anon_vma_name_free(struct kref *kref); -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); +/* mmap_lock should be read-locked */ +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_get(&anon_name->kref); +} -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. - */ -extern void free_vma_anon_name(struct vm_area_struct *vma); +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) +{ + if (anon_name) + kref_put(&anon_name->kref, anon_vma_name_free); +} -/* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + struct anon_vma_name *anon_name = anon_vma_name(orig_vma); + + if (anon_name) { + anon_vma_name_get(anon_name); + new_vma->anon_name = anon_name; + } +} + +static inline void free_anon_vma_name(struct vm_area_struct *vma) { - const char *vma_name = vma_anon_name(vma); + /* + * Not using anon_vma_name because it generates a warning if mmap_lock + * is not held, which might be the case here. + */ + if (!vma->vm_file) + anon_vma_name_put(vma->anon_name); +} - /* either both NULL, or pointers to same string */ - if (vma_name == name) +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + if (anon_name1 == anon_name2) return true; - return name && vma_name && !strcmp(name, vma_name); + return anon_name1 && anon_name2 && + !strcmp(anon_name1->name, anon_name2->name); } + #else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { return NULL; } -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) + +static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + return NULL; +} + +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_anon_vma_name(struct vm_area_struct *vma) {} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) { return true; } + #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) -- cgit v1.2.3 From 96403e11283def1d1c465c8279514c9a504d8630 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Mar 2022 20:28:55 -0800 Subject: mm: prevent vm_area_struct::anon_name refcount saturation A deep process chain with many vmas could grow really high. With default sysctl_max_map_count (64k) and default pid_max (32k) the max number of vmas in the system is 2147450880 and the refcounter has headroom of 1073774592 before it reaches REFCOUNT_SATURATED (3221225472). Therefore it's unlikely that an anonymous name refcounter will overflow with these defaults. Currently the max for pid_max is PID_MAX_LIMIT (4194304) and for sysctl_max_map_count it's INT_MAX (2147483647). In this configuration anon_vma_name refcount overflow becomes theoretically possible (that still require heavy sharing of that anon_vma_name between processes). kref refcounting interface used in anon_vma_name structure will detect a counter overflow when it reaches REFCOUNT_SATURATED value but will only generate a warning and freeze the ref counter. This would lead to the refcounted object never being freed. A determined attacker could leak memory like that but it would be rather expensive and inefficient way to do so. To ensure anon_vma_name refcount does not overflow, stop anon_vma_name sharing when the refcount reaches REFCOUNT_MAX (2147483647), which still leaves INT_MAX/2 (1073741823) values before the counter reaches REFCOUNT_SATURATED. This should provide enough headroom for raising the refcounts temporarily. Link: https://lkml.kernel.org/r/20220223153613.835563-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Alexey Gladkov Cc: Chris Hyser Cc: Christian Brauner Cc: Colin Cross Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: "Eric W. Biederman" Cc: Johannes Weiner Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Peter Collingbourne Cc: Sasha Levin Cc: Sumit Semwal Cc: Vlastimil Babka Cc: Xiaofeng Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux/mm_inline.h') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index dd3accaa4e6d..cf90b1fa2c60 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -161,15 +161,25 @@ static inline void anon_vma_name_put(struct anon_vma_name *anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } +static inline +struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) +{ + /* Prevent anon_name refcount saturation early on */ + if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { + anon_vma_name_get(anon_name); + return anon_name; + + } + return anon_vma_name_alloc(anon_name->name); +} + static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); - if (anon_name) { - anon_vma_name_get(anon_name); - new_vma->anon_name = anon_name; - } + if (anon_name) + new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) -- cgit v1.2.3