From bc0c3357601e3ff1b006600530079bd246ef0d82 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 23 Aug 2023 19:05:56 +0200
Subject: mm: remove remnants of SPLIT_RSS_COUNTING

The feature got retired in f1a7941243c1 ("mm: convert mm's rss stats into
percpu_counter"), but the patch failed to fully clean it up.

Link: https://lkml.kernel.org/r/20230823170556.2281747-1-mjguzik@gmail.com
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..7613150acab9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2628,14 +2628,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 		*maxrss = hiwater_rss;
 }
 
-#if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct mm_struct *mm);
-#else
-static inline void sync_mm_rss(struct mm_struct *mm)
-{
-}
-#endif
-
 #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
 static inline int pte_special(pte_t pte)
 {
-- 
cgit v1.2.3


From 91e79d22be75fec88ae58d274a7c9e49d6215099 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 23 Aug 2023 00:13:14 +0100
Subject: mm: convert DAX lock/unlock page to lock/unlock folio

The one caller of DAX lock/unlock page already calls compound_head(), so
use page_folio() instead, then use a folio throughout the DAX code to
remove uses of page->mapping and page->index.

[jane.chu@oracle.com: add comment to mf_generic_kill_procss(), simplify mf_generic_kill_procs:folio initialization]
  Link: https://lkml.kernel.org/r/20230908222336.186313-1-jane.chu@oracle.com
Link: https://lkml.kernel.org/r/20230822231314.349200-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jane Chu <jane.chu@oracle.com>
Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jane Chu <jane.chu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/dax.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 22cd9902345d..b463502b16e1 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -159,8 +159,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 
 struct page *dax_layout_busy_page(struct address_space *mapping);
 struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock_folio(struct folio *folio);
+void dax_unlock_folio(struct folio *folio, dax_entry_t cookie);
 dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
 		unsigned long index, struct page **page);
 void dax_unlock_mapping_entry(struct address_space *mapping,
@@ -182,14 +182,14 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping,
 	return -EOPNOTSUPP;
 }
 
-static inline dax_entry_t dax_lock_page(struct page *page)
+static inline dax_entry_t dax_lock_folio(struct folio *folio)
 {
-	if (IS_DAX(page->mapping->host))
+	if (IS_DAX(folio->mapping->host))
 		return ~0UL;
 	return 0;
 }
 
-static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
+static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
 {
 }
 
-- 
cgit v1.2.3


From b1e5a3dee255a11cbdd5a0e814829276bd33a793 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sun, 3 Sep 2023 15:13:23 +0000
Subject: mm/mremap: allow moves within the same VMA for stack moves

For the stack move happening in shift_arg_pages(), the move is happening
within the same VMA which spans the old and new ranges.

In case the aligned address happens to fall within that VMA, allow such
moves and don't abort the mremap alignment optimization.

In the regular non-stack mremap case, we cannot allow any such moves as
will end up destroying some part of the mapping (either the source of the
move, or part of the existing mapping).  So just avoid it for stack moves.

Link: https://lkml.kernel.org/r/20230903151328.2981432-3-joel@joelfernandes.org
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7613150acab9..21c3d4e8a282 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2480,7 +2480,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
-		bool need_rmap_locks);
+		bool need_rmap_locks, bool for_stack);
 
 /*
  * Flags used by change_protection().  For now we make it a bitmap so
-- 
cgit v1.2.3


From d896073fc767ebb40c11a6a9de71c390757ac64b Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 7 Sep 2023 02:29:27 +0000
Subject: mm/damon/core: add more comments for nr_accesses

The comment on struct damon_region about nr_accesses field looks not
sufficient.  Many people actually used to ask what nr_accesses mean.
There is more detailed explanation of the mechanism on the comment for
struct damon_attrs, but it is also ambiguous, as it doesn't specify the
name of the counter for aggregating the access check results.  Make
those more detailed.

Link: https://lkml.kernel.org/r/20230907022929.91361-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ae2664d1d5f1..266f92b34dd2 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -43,6 +43,10 @@ struct damon_addr_range {
  * @list:		List head for siblings.
  * @age:		Age of this region.
  *
+ * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be
+ * increased for every &damon_attrs->sample_interval if an access to the region
+ * during the last sampling interval is found.
+ *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
  * regions are merged into a new region, both @nr_accesses and @age of the new
@@ -472,13 +476,14 @@ struct damon_callback {
  *				regions.
  *
  * For each @sample_interval, DAMON checks whether each region is accessed or
- * not.  It aggregates and keeps the access information (number of accesses to
- * each region) for @aggr_interval time.  DAMON also checks whether the target
- * memory regions need update (e.g., by ``mmap()`` calls from the application,
- * in case of virtual memory monitoring) and applies the changes for each
- * @ops_update_interval.  All time intervals are in micro-seconds.
- * Please refer to &struct damon_operations and &struct damon_callback for more
- * detail.
+ * not during the last @sample_interval.  If such access is found, DAMON
+ * aggregates the information by increasing &damon_region->nr_accesses for
+ * @aggr_interval time.  For each @aggr_interval, the count is reset.  DAMON
+ * also checks whether the target memory regions need update (e.g., by
+ * ``mmap()`` calls from the application, in case of virtual memory monitoring)
+ * and applies the changes for each @ops_update_interval.  All time intervals
+ * are in micro-seconds.  Please refer to &struct damon_operations and &struct
+ * damon_callback for more detail.
  */
 struct damon_attrs {
 	unsigned long sample_interval;
-- 
cgit v1.2.3


From cf0a96bd3ab4d9d8a1c92baf1a822f2ddbca3a34 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 7 Sep 2023 02:29:28 +0000
Subject: mm/damon/core: remove duplicated comment for watermarks-based
 deactivation

The comment for explaining about watermarks-based monitoring part
deactivation is duplicated in two paragraphs.  Remove one.

Link: https://lkml.kernel.org/r/20230907022929.91361-11-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 266f92b34dd2..ab3089de1478 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -317,9 +317,6 @@ struct damos_access_pattern {
  * monitoring context are inactive, DAMON stops monitoring either, and just
  * repeatedly checks the watermarks.
  *
- * If all schemes that registered to a &struct damon_ctx are inactive, DAMON
- * stops monitoring and just repeatedly checks the watermarks.
- *
  * Before applying the &action to a memory region, &struct damon_operations
  * implementation could check pages of the region and skip &action to respect
  * &filters
-- 
cgit v1.2.3


From 7cd34dd3c9bf1d67ce7d1ab3fe8886c583ae0d9a Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 11 Sep 2023 14:21:13 +0300
Subject: efi/unaccepted: do not let /proc/vmcore try to access unaccepted
 memory

Patch series "Do not try to access unaccepted memory", v2.

Support for unaccepted memory was added recently, refer commit
dcdfdd40fa82 ("mm: Add support for unaccepted memory"), whereby
a virtual machine may need to accept memory before it can be used.

Plug a few gaps where RAM is exposed without checking if it is
unaccepted memory.


This patch (of 2):

Support for unaccepted memory was added recently, refer commit
dcdfdd40fa82 ("mm: Add support for unaccepted memory"), whereby a virtual
machine may need to accept memory before it can be used.

Do not let /proc/vmcore try to access unaccepted memory because it can
cause the guest to fail.

For /proc/vmcore, which is read-only, this means a read or mmap of
unaccepted memory will return zeros.

Link: https://lkml.kernel.org/r/20230911112114.91323-1-adrian.hunter@intel.com
Link: https://lkml.kernel.org/r/20230911112114.91323-2-adrian.hunter@intel.com
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21c3d4e8a282..31dc25d3f6b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4054,4 +4054,11 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)
 
 #endif
 
+static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
+{
+	phys_addr_t paddr = pfn << PAGE_SHIFT;
+
+	return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
+}
+
 #endif /* _LINUX_MM_H */
-- 
cgit v1.2.3


From 3ee0aa9f06756e959633ffda37856c6741d948ed Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:25:14 +0800
Subject: mm: move some shrinker-related function declarations to mm/internal.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "cleanups for lockless slab shrink", v4.

This series is some cleanups for lockless slab shrink.


This patch (of 4):

The following functions are only used inside the mm subsystem, so it's
better to move their declarations to the mm/internal.h file.

1. shrinker_debugfs_add()
2. shrinker_debugfs_detach()
3. shrinker_debugfs_remove()

Link: https://lkml.kernel.org/r/20230911092517.64141-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911092517.64141-2-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian König <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 224293b2dd06..8dc15aa37410 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -106,28 +106,9 @@ extern void free_prealloced_shrinker(struct shrinker *shrinker);
 extern void synchronize_shrinkers(void);
 
 #ifdef CONFIG_SHRINKER_DEBUG
-extern int shrinker_debugfs_add(struct shrinker *shrinker);
-extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
-					      int *debugfs_id);
-extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
-				    int debugfs_id);
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
 						  const char *fmt, ...);
 #else /* CONFIG_SHRINKER_DEBUG */
-static inline int shrinker_debugfs_add(struct shrinker *shrinker)
-{
-	return 0;
-}
-static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
-						     int *debugfs_id)
-{
-	*debugfs_id = -1;
-	return NULL;
-}
-static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
-					   int debugfs_id)
-{
-}
 static inline __printf(2, 3)
 int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
 {
-- 
cgit v1.2.3


From 0b2f5ea1aa39c0ed34bdadb53faf519e3d84ac4a Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:25:17 +0800
Subject: drm/ttm: introduce pool_shrink_rwsem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, synchronize_shrinkers() is only used by TTM pool.  It only
requires that no shrinkers run in parallel.

After we use RCU+refcount method to implement the lockless slab shrink, we
can not use shrinker_rwsem or synchronize_rcu() to guarantee that all
shrinker invocations have seen an update before freeing memory.

So we introduce a new pool_shrink_rwsem to implement a private
ttm_pool_synchronize_shrinkers(), so as to achieve the same purpose.

Link: https://lkml.kernel.org/r/20230911092517.64141-5-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <cel@kernel.org>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 8dc15aa37410..6b5843c3b827 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -103,7 +103,6 @@ extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker,
 					    const char *fmt, ...);
 extern void unregister_shrinker(struct shrinker *shrinker);
 extern void free_prealloced_shrinker(struct shrinker *shrinker);
-extern void synchronize_shrinkers(void);
 
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
-- 
cgit v1.2.3


From c42d50aefd17a6bad3ed617769edbbb579137545 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:00 +0800
Subject: mm: shrinker: add infrastructure for dynamically allocating shrinker

Patch series "use refcount+RCU method to implement lockless slab shrink",
v6.

1. Background
=============

We used to implement the lockless slab shrink with SRCU [1], but then kernel
test robot reported -88.8% regression in stress-ng.ramfs.ops_per_sec test
case [2], so we reverted it [3].

This patch series aims to re-implement the lockless slab shrink using the
refcount+RCU method proposed by Dave Chinner [4].

[1]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/
[2]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/
[3]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/
[4]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/

2. Implementation
=================

Currently, the shrinker instances can be divided into the following three types:

a) global shrinker instance statically defined in the kernel, such as
   workingset_shadow_shrinker.

b) global shrinker instance statically defined in the kernel modules, such as
   mmu_shrinker in x86.

c) shrinker instance embedded in other structures.

For case a, the memory of shrinker instance is never freed. For case b, the
memory of shrinker instance will be freed after synchronize_rcu() when the
module is unloaded. For case c, the memory of shrinker instance will be freed
along with the structure it is embedded in.

In preparation for implementing lockless slab shrink, we need to dynamically
allocate those shrinker instances in case c, then the memory can be dynamically
freed alone by calling kfree_rcu().

This patchset adds the following new APIs for dynamically allocating shrinker,
and add a private_data field to struct shrinker to record and get the original
embedded structure.

1. shrinker_alloc()
2. shrinker_register()
3. shrinker_free()

In order to simplify shrinker-related APIs and make shrinker more independent of
other kernel mechanisms, this patchset uses the above APIs to convert all
shrinkers (including case a and b) to dynamically allocated, and then remove all
existing APIs. This will also have another advantage mentioned by Dave Chinner:

```
The other advantage of this is that it will break all the existing out of tree
code and third party modules using the old API and will no longer work with a
kernel using lockless slab shrinkers. They need to break (both at the source and
binary levels) to stop bad things from happening due to using uncoverted
shrinkers in the new setup.
```

Then we free the shrinker by calling call_rcu(), and use rcu_read_{lock,unlock}()
to ensure that the shrinker instance is valid. And the shrinker::refcount
mechanism ensures that the shrinker instance will not be run again after
unregistration. So the structure that records the pointer of shrinker instance
can be safely freed without waiting for the RCU read-side critical section.

In this way, while we implement the lockless slab shrink, we don't need to be
blocked in unregister_shrinker() to wait RCU read-side critical section.

PATCH 1: introduce new APIs
PATCH 2~38: convert all shrinnkers to use new APIs
PATCH 39: remove old APIs
PATCH 40~41: some cleanups and preparations
PATCH 42-43: implement the lockless slab shrink
PATCH 44~45: convert shrinker_rwsem to mutex

3. Testing
==========

3.1 slab shrink stress test
---------------------------

We can reproduce the down_read_trylock() hotspot through the following script:

```

DIR="/root/shrinker/memcg/mnt"

do_create()
{
    mkdir -p /sys/fs/cgroup/memory/test
    echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    for i in `seq 0 $1`;
    do
        mkdir -p /sys/fs/cgroup/memory/test/$i;
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        mkdir -p $DIR/$i;
    done
}

do_mount()
{
    for i in `seq $1 $2`;
    do
        mount -t tmpfs $i $DIR/$i;
    done
}

do_touch()
{
    for i in `seq $1 $2`;
    do
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
    done
}

case "$1" in
  touch)
    do_touch $2 $3
    ;;
  test)
    do_create 4000
    do_mount 0 4000
    do_touch 0 3000
    ;;
  *)
    exit 1
    ;;
esac
```

Save the above script, then run test and touch commands. Then we can use the
following perf command to view hotspots:

perf top -U -F 999

1) Before applying this patchset:

  33.15%  [kernel]          [k] down_read_trylock
  25.38%  [kernel]          [k] shrink_slab
  21.75%  [kernel]          [k] up_read
   4.45%  [kernel]          [k] _find_next_bit
   2.27%  [kernel]          [k] do_shrink_slab
   1.80%  [kernel]          [k] intel_idle_irq
   1.79%  [kernel]          [k] shrink_lruvec
   0.67%  [kernel]          [k] xas_descend
   0.41%  [kernel]          [k] mem_cgroup_iter
   0.40%  [kernel]          [k] shrink_node
   0.38%  [kernel]          [k] list_lru_count_one

2) After applying this patchset:

  64.56%  [kernel]          [k] shrink_slab
  12.18%  [kernel]          [k] do_shrink_slab
   3.30%  [kernel]          [k] __rcu_read_unlock
   2.61%  [kernel]          [k] shrink_lruvec
   2.49%  [kernel]          [k] __rcu_read_lock
   1.93%  [kernel]          [k] intel_idle_irq
   0.89%  [kernel]          [k] shrink_node
   0.81%  [kernel]          [k] mem_cgroup_iter
   0.77%  [kernel]          [k] mem_cgroup_calculate_protection
   0.66%  [kernel]          [k] list_lru_count_one

We can see that the first perf hotspot becomes shrink_slab, which is what we
expect.

3.2 registration and unregistration stress test
-----------------------------------------------

Run the command below to test:

stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 &

1) Before applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            473062     60.00      8.00    279.13      7884.12        1647.59
for a 60.01s run time:
   1440.34s available CPU time
      7.99s user time   (  0.55%)
    279.13s system time ( 19.38%)
    287.12s total time  ( 19.93%)
load average: 7.12 2.99 1.15
successful run completed in 60.01s (1 min, 0.01 secs)

2) After applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            477165     60.00      8.13    281.34      7952.55        1648.40
for a 60.01s run time:
   1440.33s available CPU time
      8.12s user time   (  0.56%)
    281.34s system time ( 19.53%)
    289.46s total time  ( 20.10%)
load average: 6.98 3.03 1.19
successful run completed in 60.01s (1 min, 0.01 secs)

We can see that the ops/s has hardly changed.


This patch (of 45):

Currently, the shrinker instances can be divided into the following three
types:

a) global shrinker instance statically defined in the kernel, such as
   workingset_shadow_shrinker.

b) global shrinker instance statically defined in the kernel modules, such
   as mmu_shrinker in x86.

c) shrinker instance embedded in other structures.

For case a, the memory of shrinker instance is never freed. For case b,
the memory of shrinker instance will be freed after synchronize_rcu() when
the module is unloaded. For case c, the memory of shrinker instance will
be freed along with the structure it is embedded in.

In preparation for implementing lockless slab shrink, we need to
dynamically allocate those shrinker instances in case c, then the memory
can be dynamically freed alone by calling kfree_rcu().

So this commit adds the following new APIs for dynamically allocating
shrinker, and add a private_data field to struct shrinker to record and
get the original embedded structure.

1. shrinker_alloc()

Used to allocate shrinker instance itself and related memory, it will
return a pointer to the shrinker instance on success and NULL on failure.

2. shrinker_register()

Used to register the shrinker instance, which is same as the current
register_shrinker_prepared().

3. shrinker_free()

Used to unregister (if needed) and free the shrinker instance.

In order to simplify shrinker-related APIs and make shrinker more
independent of other kernel mechanisms, subsequent submissions will use
the above API to convert all shrinkers (including case a and b) to
dynamically allocated, and then remove all existing APIs.

This will also have another advantage mentioned by Dave Chinner:

```
The other advantage of this is that it will break all the existing
out of tree code and third party modules using the old API and will
no longer work with a kernel using lockless slab shrinkers. They
need to break (both at the source and binary levels) to stop bad
things from happening due to using unconverted shrinkers in the new
setup.
```

[zhengqi.arch@bytedance.com: mm: shrinker: some cleanup]
  Link: https://lkml.kernel.org/r/20230919024607.65463-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911094444.68966-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911094444.68966-2-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <cel@kernel.org>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 6b5843c3b827..f4a5249f00b2 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -70,6 +70,8 @@ struct shrinker {
 	int seeks;	/* seeks to recreate an obj */
 	unsigned flags;
 
+	void *private_data;
+
 	/* These are for internal use */
 	struct list_head list;
 #ifdef CONFIG_MEMCG
@@ -86,15 +88,22 @@ struct shrinker {
 };
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
 
-/* Flags */
-#define SHRINKER_REGISTERED	(1 << 0)
-#define SHRINKER_NUMA_AWARE	(1 << 1)
-#define SHRINKER_MEMCG_AWARE	(1 << 2)
+/* Internal flags */
+#define SHRINKER_REGISTERED	BIT(0)
+#define SHRINKER_ALLOCATED	BIT(1)
+
+/* Flags for users to use */
+#define SHRINKER_NUMA_AWARE	BIT(2)
+#define SHRINKER_MEMCG_AWARE	BIT(3)
 /*
  * It just makes sense when the shrinker is also MEMCG_AWARE for now,
  * non-MEMCG_AWARE shrinker should not have this flag set.
  */
-#define SHRINKER_NONSLAB	(1 << 3)
+#define SHRINKER_NONSLAB	BIT(4)
+
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
+void shrinker_register(struct shrinker *shrinker);
+void shrinker_free(struct shrinker *shrinker);
 
 extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,
 					    const char *fmt, ...);
-- 
cgit v1.2.3


From 4b403dfa8ea8dfbc9d317b25a0433c1ac6765f7f Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:30 +0800
Subject: jbd2,ext4: dynamically allocate the jbd2-journal shrinker

In preparation for implementing lockless slab shrink, use new APIs to
dynamically allocate the jbd2-journal shrinker, so that it can be freed
asynchronously via RCU. Then it doesn't need to wait for RCU read-side
critical section when releasing the struct journal_s.

Link: https://lkml.kernel.org/r/20230911094444.68966-32-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Jan Kara <jack@suse.cz>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/jbd2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 52772c826c86..6dcbb4eb80fb 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -886,7 +886,7 @@ struct journal_s
 	 * Journal head shrinker, reclaim buffer's journal head which
 	 * has been written back.
 	 */
-	struct shrinker		j_shrinker;
+	struct shrinker		*j_shrinker;
 
 	/**
 	 * @j_checkpoint_jh_count:
-- 
cgit v1.2.3


From 1720f5dd8d3af34d6023fb9e8c35e5e60e8b6643 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:37 +0800
Subject: fs: super: dynamically allocate the s_shrink

In preparation for implementing lockless slab shrink, use new APIs to
dynamically allocate the s_shrink, so that it can be freed asynchronously
via RCU. Then it doesn't need to wait for RCU read-side critical section
when releasing the struct super_block.

Link: https://lkml.kernel.org/r/20230911094444.68966-39-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: David Sterba <dsterba@suse.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..fd539c9fef8e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1265,7 +1265,7 @@ struct super_block {
 
 	const struct dentry_operations *s_d_op; /* default d_op for dentries */
 
-	struct shrinker s_shrink;	/* per-sb shrinker handle */
+	struct shrinker *s_shrink;	/* per-sb shrinker handle */
 
 	/* Number of inodes with nlink == 0 but still referenced */
 	atomic_long_t s_remove_count;
-- 
cgit v1.2.3


From f2383e01507eeee8a1c1283d61a117a97d6c4ebe Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:38 +0800
Subject: mm: shrinker: remove old APIs

Now no users are using the old APIs, just remove them.

Link: https://lkml.kernel.org/r/20230911094444.68966-40-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index f4a5249f00b2..1d3899f37229 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -105,14 +105,6 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
 void shrinker_register(struct shrinker *shrinker);
 void shrinker_free(struct shrinker *shrinker);
 
-extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,
-					    const char *fmt, ...);
-extern void register_shrinker_prepared(struct shrinker *shrinker);
-extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker,
-					    const char *fmt, ...);
-extern void unregister_shrinker(struct shrinker *shrinker);
-extern void free_prealloced_shrinker(struct shrinker *shrinker);
-
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
 						  const char *fmt, ...);
-- 
cgit v1.2.3


From 307bececcd1205bcb67a3c0d53a69db237ccc9d4 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:39 +0800
Subject: mm: shrinker: add a secondary array for shrinker_info::{map,
 nr_deferred}

Currently, we maintain two linear arrays per node per memcg, which are
shrinker_info::map and shrinker_info::nr_deferred. And we need to resize
them when the shrinker_nr_max is exceeded, that is, allocate a new array,
and then copy the old array to the new array, and finally free the old
array by RCU.

For shrinker_info::map, we do set_bit() under the RCU lock, so we may set
the value into the old map which is about to be freed. This may cause the
value set to be lost. The current solution is not to copy the old map when
resizing, but to set all the corresponding bits in the new map to 1. This
solves the data loss problem, but bring the overhead of more pointless
loops while doing memcg slab shrink.

For shrinker_info::nr_deferred, we will only modify it under the read lock
of shrinker_rwsem, so it will not run concurrently with the resizing. But
after we make memcg slab shrink lockless, there will be the same data loss
problem as shrinker_info::map, and we can't work around it like the map.

For such resizable arrays, the most straightforward idea is to change it
to xarray, like we did for list_lru [1]. We need to do xa_store() in the
list_lru_add()-->set_shrinker_bit(), but this will cause memory
allocation, and the list_lru_add() doesn't accept failure. A possible
solution is to pre-allocate, but the location of pre-allocation is not
well determined (such as deferred_split_shrinker case).

Therefore, this commit chooses to introduce the following secondary array
for shrinker_info::{map, nr_deferred}:

+---------------+--------+--------+-----+
| shrinker_info | unit 0 | unit 1 | ... | (secondary array)
+---------------+--------+--------+-----+
                     |
                     v
                +---------------+-----+
                | nr_deferred[] | map | (leaf array)
                +---------------+-----+
                (shrinker_info_unit)

The leaf array is never freed unless the memcg is destroyed. The secondary
array will be resized every time the shrinker id exceeds shrinker_nr_max.
So the shrinker_info_unit can be indexed from both the old and the new
shrinker_info->unit[x]. Then even if we get the old secondary array under
the RCU lock, the found map and nr_deferred are also true, so the updated
nr_deferred and map will not be lost.

[1]. https://lore.kernel.org/all/20220228122126.37293-13-songmuchun@bytedance.com/

[zhengqi.arch@bytedance.com: unlock the &shrinker_rwsem before the call to free_shrinker_info()]
  Link: https://lkml.kernel.org/r/20230928141517.12164-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911094444.68966-41-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 12 +-----------
 include/linux/shrinker.h   | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e4e24da16d2c..031102ac9311 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,7 @@
 #include <linux/vmstat.h>
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
+#include <linux/shrinker.h>
 
 struct mem_cgroup;
 struct obj_cgroup;
@@ -88,17 +89,6 @@ struct mem_cgroup_reclaim_iter {
 	unsigned int generation;
 };
 
-/*
- * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
- * shrinkers, which have elements charged to this memcg.
- */
-struct shrinker_info {
-	struct rcu_head rcu;
-	atomic_long_t *nr_deferred;
-	unsigned long *map;
-	int map_nr_max;
-};
-
 struct lruvec_stats_percpu {
 	/* Local (CPU and cgroup) state */
 	long state[NR_VM_NODE_STAT_ITEMS];
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 1d3899f37229..ba5ac82b5dbd 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -5,6 +5,23 @@
 #include <linux/atomic.h>
 #include <linux/types.h>
 
+#define SHRINKER_UNIT_BITS	BITS_PER_LONG
+
+/*
+ * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
+ * shrinkers, which have elements charged to the memcg.
+ */
+struct shrinker_info_unit {
+	atomic_long_t nr_deferred[SHRINKER_UNIT_BITS];
+	DECLARE_BITMAP(map, SHRINKER_UNIT_BITS);
+};
+
+struct shrinker_info {
+	struct rcu_head rcu;
+	int map_nr_max;
+	struct shrinker_info_unit *unit[];
+};
+
 /*
  * This struct is used to pass information from page reclaim to the shrinkers.
  * We consolidate the values for easier extension later.
-- 
cgit v1.2.3


From ca1d36b823944f24b5755311e95883fb5fdb807b Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:41 +0800
Subject: mm: shrinker: make global slab shrink lockless

The shrinker_rwsem is a global read-write lock in shrinkers subsystem,
which protects most operations such as slab shrink, registration and
unregistration of shrinkers, etc. This can easily cause problems in the
following cases.

1) When the memory pressure is high and there are many filesystems
   mounted or unmounted at the same time, slab shrink will be affected
   (down_read_trylock() failed).

   Such as the real workload mentioned by Kirill Tkhai:

   ```
   One of the real workloads from my experience is start
   of an overcommitted node containing many starting
   containers after node crash (or many resuming containers
   after reboot for kernel update). In these cases memory
   pressure is huge, and the node goes round in long reclaim.
   ```

2) If a shrinker is blocked (such as the case mentioned
   in [1]) and a writer comes in (such as mount a fs),
   then this writer will be blocked and cause all
   subsequent shrinker-related operations to be blocked.

Even if there is no competitor when shrinking slab, there may still be a
problem. The down_read_trylock() may become a perf hotspot with frequent
calls to shrink_slab(). Because of the poor multicore scalability of
atomic operations, this can lead to a significant drop in IPC
(instructions per cycle).

We used to implement the lockless slab shrink with SRCU [2], but then
kernel test robot reported -88.8% regression in
stress-ng.ramfs.ops_per_sec test case [3], so we reverted it [4].

This commit uses the refcount+RCU method [5] proposed by Dave Chinner
to re-implement the lockless global slab shrink. The memcg slab shrink is
handled in the subsequent patch.

For now, all shrinker instances are converted to dynamically allocated and
will be freed by call_rcu(). So we can use rcu_read_{lock,unlock}() to
ensure that the shrinker instance is valid.

And the shrinker instance will not be run again after unregistration. So
the structure that records the pointer of shrinker instance can be safely
freed without waiting for the RCU read-side critical section.

In this way, while we implement the lockless slab shrink, we don't need to
be blocked in unregister_shrinker().

The following are the test results:

stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 &

1) Before applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            473062     60.00      8.00    279.13      7884.12        1647.59
for a 60.01s run time:
   1440.34s available CPU time
      7.99s user time   (  0.55%)
    279.13s system time ( 19.38%)
    287.12s total time  ( 19.93%)
load average: 7.12 2.99 1.15
successful run completed in 60.01s (1 min, 0.01 secs)

2) After applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            477165     60.00      8.13    281.34      7952.55        1648.40
for a 60.01s run time:
   1440.33s available CPU time
      8.12s user time   (  0.56%)
    281.34s system time ( 19.53%)
    289.46s total time  ( 20.10%)
load average: 6.98 3.03 1.19
successful run completed in 60.01s (1 min, 0.01 secs)

We can see that the ops/s has hardly changed.

[1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
[2]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/
[3]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/
[4]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/
[5]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/

Link: https://lkml.kernel.org/r/20230911094444.68966-43-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index ba5ac82b5dbd..e4f93120e0ab 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -4,6 +4,8 @@
 
 #include <linux/atomic.h>
 #include <linux/types.h>
+#include <linux/refcount.h>
+#include <linux/completion.h>
 
 #define SHRINKER_UNIT_BITS	BITS_PER_LONG
 
@@ -87,6 +89,17 @@ struct shrinker {
 	int seeks;	/* seeks to recreate an obj */
 	unsigned flags;
 
+	/*
+	 * The reference count of this shrinker. Registered shrinker have an
+	 * initial refcount of 1, then the lookup operations are now allowed
+	 * to use it via shrinker_try_get(). Later in the unregistration step,
+	 * the initial refcount will be discarded, and will free the shrinker
+	 * asynchronously via RCU after its refcount reaches 0.
+	 */
+	refcount_t refcount;
+	struct completion done;	/* use to wait for refcount to reach 0 */
+	struct rcu_head rcu;
+
 	void *private_data;
 
 	/* These are for internal use */
@@ -122,6 +135,17 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
 void shrinker_register(struct shrinker *shrinker);
 void shrinker_free(struct shrinker *shrinker);
 
+static inline bool shrinker_try_get(struct shrinker *shrinker)
+{
+	return refcount_inc_not_zero(&shrinker->refcount);
+}
+
+static inline void shrinker_put(struct shrinker *shrinker)
+{
+	if (refcount_dec_and_test(&shrinker->refcount))
+		complete(&shrinker->done);
+}
+
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
 						  const char *fmt, ...);
-- 
cgit v1.2.3


From 09c550508a4b8f7844b197cc16877dd0f7c42d8f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 13 Sep 2023 14:51:13 +0200
Subject: mm/rmap: pass folio to hugepage_add_anon_rmap()

Let's pass a folio; we are always mapping the entire thing.

Link: https://lkml.kernel.org/r/20230913125113.313322-7-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 51cc21ebb568..d22f4d21a11c 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -203,7 +203,7 @@ void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
 
-void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
+void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
 void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
-- 
cgit v1.2.3


From 73eab3ca481e5be0f1fd8140365d604482f84ee1 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 13 Sep 2023 17:51:27 +0800
Subject: mm: migrate: convert migrate_misplaced_page() to
 migrate_misplaced_folio()

At present, numa balance only support base page and PMD-mapped THP, but we
will expand to support to migrate large folio/pte-mapped THP in the
future, it is better to make migrate_misplaced_page() to take a folio
instead of a page, and rename it to migrate_misplaced_folio(), it is a
preparation, also this remove several compound_head() calls.

Link: https://lkml.kernel.org/r/20230913095131.2426871-5-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 711dd9412561..2ce13e8a309b 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -142,10 +142,10 @@ const struct movable_operations *page_movable_ops(struct page *page)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
 			   int node);
 #else
-static inline int migrate_misplaced_page(struct page *page,
+static inline int migrate_misplaced_folio(struct folio *folio,
 					 struct vm_area_struct *vma, int node)
 {
 	return -EAGAIN; /* can't migrate now */
-- 
cgit v1.2.3


From 2a41815784e029cbef571511384f00fa40f2a82e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:04 +0100
Subject: buffer: pass GFP flags to folio_alloc_buffers()

Patch series "Add and use bdev_getblk()", v2.

This patch series fixes a bug reported by Hui Zhu; see proposed
patches v1 and v2:
https://lore.kernel.org/linux-fsdevel/20230811035705.3296-1-teawaterz@linux.alibaba.com/
https://lore.kernel.org/linux-fsdevel/20230811071519.1094-1-teawaterz@linux.alibaba.com/

I decided to go in a rather different direction for this fix, and fix a
related problem at the same time.  I don't think there's any urgency to
rush this into Linus' tree, nor have I marked it for stable.  Reasonable
people may disagree.


This patch (of 8):

Instead of creating entirely new flags, inherit them from grow_dev_page().
The other callers create the same flags that this function used to
create.

Link: https://lkml.kernel.org/r/20230914150011.843330-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20230914150011.843330-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 44e9de51eedf..67d94d2be475 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -198,7 +198,7 @@ void touch_buffer(struct buffer_head *bh);
 void folio_set_bh(struct buffer_head *bh, struct folio *folio,
 		  unsigned long offset);
 struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
-					bool retry);
+					gfp_t gfp);
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
 void create_empty_buffers(struct page *, unsigned long,
-- 
cgit v1.2.3


From 3ed65f04aac4d1cd025f30ee3fac174bcbf2b018 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:05 +0100
Subject: buffer: hoist GFP flags from grow_dev_page() to __getblk_gfp()

grow_dev_page() is only called by grow_buffers().  grow_buffers() is only
called by __getblk_slow() and __getblk_slow() is only called from
__getblk_gfp(), so it is safe to move the GFP flags setting all the way
up.  With that done, add a new bdev_getblk() entry point that leaves the
GFP flags the way the caller specified them.

[willy@infradead.org: fix grow_dev_page() error handling]
  Link: https://lkml.kernel.org/r/ZRREEIwqiy5DijKB@casper.infradead.org
Link: https://lkml.kernel.org/r/20230914150011.843330-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 67d94d2be475..7825bb3d63a7 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -227,6 +227,8 @@ void __wait_on_buffer(struct buffer_head *);
 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
 struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
 			unsigned size);
+struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
+		unsigned size, gfp_t gfp);
 struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
 				  unsigned size, gfp_t gfp);
 void __brelse(struct buffer_head *);
-- 
cgit v1.2.3


From c645e65c0675dd4df7ee68b995154dc1c1e7ce3b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:08 +0100
Subject: buffer: convert getblk_unmovable() and __getblk() to use
 bdev_getblk()

Move these two functions up in the file for the benefit of the next patch,
and pass in all of the GFP flags to use instead of the partial GFP flags
used by __getblk_gfp().

Link: https://lkml.kernel.org/r/20230914150011.843330-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7825bb3d63a7..9a3ca5f6d63d 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -340,6 +340,28 @@ sb_breadahead(struct super_block *sb, sector_t block)
 	__breadahead(sb->s_bdev, block, sb->s_blocksize);
 }
 
+static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
+		sector_t block, unsigned size)
+{
+	gfp_t gfp;
+
+	gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+	gfp |= __GFP_NOFAIL;
+
+	return bdev_getblk(bdev, block, size, gfp);
+}
+
+static inline struct buffer_head *__getblk(struct block_device *bdev,
+		sector_t block, unsigned size)
+{
+	gfp_t gfp;
+
+	gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+	gfp |= __GFP_MOVABLE | __GFP_NOFAIL;
+
+	return bdev_getblk(bdev, block, size, gfp);
+}
+
 static inline struct buffer_head *
 sb_getblk(struct super_block *sb, sector_t block)
 {
@@ -387,20 +409,6 @@ static inline void lock_buffer(struct buffer_head *bh)
 		__lock_buffer(bh);
 }
 
-static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
-						   sector_t block,
-						   unsigned size)
-{
-	return __getblk_gfp(bdev, block, size, 0);
-}
-
-static inline struct buffer_head *__getblk(struct block_device *bdev,
-					   sector_t block,
-					   unsigned size)
-{
-	return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
-}
-
 static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)
 {
 	if (!buffer_uptodate(bh) && trylock_buffer(bh)) {
-- 
cgit v1.2.3


From 4b9c8b1919323f7f359376ca31a4c721cb2b3acf Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:09 +0100
Subject: buffer: convert sb_getblk() to call __getblk()

Now that __getblk() is in the right place in the file, it is trivial to
call it from sb_getblk().

Link: https://lkml.kernel.org/r/20230914150011.843330-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 9a3ca5f6d63d..b294e2cccbae 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -362,13 +362,12 @@ static inline struct buffer_head *__getblk(struct block_device *bdev,
 	return bdev_getblk(bdev, block, size, gfp);
 }
 
-static inline struct buffer_head *
-sb_getblk(struct super_block *sb, sector_t block)
+static inline struct buffer_head *sb_getblk(struct super_block *sb,
+		sector_t block)
 {
-	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
+	return __getblk(sb->s_bdev, block, sb->s_blocksize);
 }
 
-
 static inline struct buffer_head *
 sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
 {
-- 
cgit v1.2.3


From 8a83ac54940d27b8f56d766e1cb270d150fedd50 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:10 +0100
Subject: ext4: call bdev_getblk() from sb_getblk_gfp()

Most of the callers of sb_getblk_gfp() already assumed that they were
passing the entire GFP flags to use.  Fix up the two callers that didn't,
and remove the __GFP_NOFAIL from them since they both appear to correctly
handle failure.

Link: https://lkml.kernel.org/r/20230914150011.843330-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index b294e2cccbae..22f13eece719 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -368,10 +368,10 @@ static inline struct buffer_head *sb_getblk(struct super_block *sb,
 	return __getblk(sb->s_bdev, block, sb->s_blocksize);
 }
 
-static inline struct buffer_head *
-sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
+static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb,
+		sector_t block, gfp_t gfp)
 {
-	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
+	return bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp);
 }
 
 static inline struct buffer_head *
-- 
cgit v1.2.3


From 93b13ecaa713e1fcbf23b7483eec065d300c5ad8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:11 +0100
Subject: buffer: remove __getblk_gfp()

Inline it into __bread_gfp().

Link: https://lkml.kernel.org/r/20230914150011.843330-9-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 22f13eece719..3dc4720e4773 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -229,8 +229,6 @@ struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
 			unsigned size);
 struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
 		unsigned size, gfp_t gfp);
-struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
-				  unsigned size, gfp_t gfp);
 void __brelse(struct buffer_head *);
 void __bforget(struct buffer_head *);
 void __breadahead(struct block_device *, sector_t block, unsigned int size);
-- 
cgit v1.2.3


From 3dfbb555c98ac55b9d911f9af0e35014b445fb41 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 14 Sep 2023 15:16:39 +0200
Subject: mm, vmscan: remove ISOLATE_UNMAPPED

This isolate_mode_t flag is effectively unused since 89f6c88a6ab4 ("mm:
__isolate_lru_page_prepare() in isolate_migratepages_block()") as
sc->may_unmap is now checked directly (and only node_reclaim has a mode
that sets it to 0).  The last remaining place is mm_vmscan_lru_isolate
tracepoint for the isolate_mode parameter.  That one was mainly used to
indicate the active/inactive mode, which the trace-vmscan-postprocess.pl
script consumed, but that got silently broken.  After fixing the script by
the previous patch, it does not need the isolate_mode anymore.  So just
remove the parameter and with that the whole ISOLATE_UNMAPPED flag.

Link: https://lkml.kernel.org/r/20230914131637.12204-4-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4106fbc5b4b3..486587fcd27f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -639,8 +639,6 @@ struct lruvec {
 #endif
 };
 
-/* Isolate unmapped pages */
-#define ISOLATE_UNMAPPED	((__force isolate_mode_t)0x2)
 /* Isolate for asynchronous migration */
 #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
 /* Isolate unevictable pages */
-- 
cgit v1.2.3


From 4472edf63d6630e6cf65e205b4fc8c3c94d0afe5 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 14 Sep 2023 02:15:23 +0000
Subject: mm/damon/core: use number of passed access sampling as a timer

DAMON sleeps for sampling interval after each sampling, and check if the
aggregation interval and the ops update interval have passed using
ktime_get_coarse_ts64() and baseline timestamps for the intervals.  That
design is for making the operations occur at deterministic timing
regardless of the time that spend for each work.  However, it turned out
it is not that useful, and incur not-that-intuitive results.

After all, timer functions, and especially sleep functions that DAMON uses
to wait for specific timing, are not necessarily strictly accurate.  It is
legal design, so no problem.  However, depending on such inaccuracies, the
nr_accesses can be larger than aggregation interval divided by sampling
interval.  For example, with the default setting (5 ms sampling interval
and 100 ms aggregation interval) we frequently show regions having
nr_accesses larger than 20.  Also, if the execution of a DAMOS scheme
takes a long time, next aggregation could happen before enough number of
samples are collected.  This is not what usual users would intuitively
expect.

Since access check sampling is the smallest unit work of DAMON, using the
number of passed sampling intervals as the DAMON-internal timer can easily
avoid these problems.  That is, convert aggregation and ops update
intervals to numbers of sampling intervals that need to be passed before
those operations be executed, count the number of passed sampling
intervals, and invoke the operations as soon as the specific amount of
sampling intervals passed.  Make the change.

Note that this could make a behavioral change to settings that using
intervals that not aligned by the sampling interval.  For example, if the
sampling interval is 5 ms and the aggregation interval is 12 ms, DAMON
effectively uses 15 ms as its aggregation interval, because it checks
whether the aggregation interval after sleeping the sampling interval.
This change will make DAMON to effectively use 10 ms as aggregation
interval, since it uses 'aggregation interval / sampling interval *
sampling interval' as the effective aggregation interval, and we don't use
floating point types.  Usual users would have used aligned intervals, so
this behavioral change is not expected to make any meaningful impact, so
just make this change.

Link: https://lkml.kernel.org/r/20230914021523.60649-1-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ab3089de1478..9a32b8fd0bd3 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -524,8 +524,18 @@ struct damon_ctx {
 	struct damon_attrs attrs;
 
 /* private: internal use only */
-	struct timespec64 last_aggregation;
-	struct timespec64 last_ops_update;
+	/* number of sample intervals that passed since this context started */
+	unsigned long passed_sample_intervals;
+	/*
+	 * number of sample intervals that should be passed before next
+	 * aggregation
+	 */
+	unsigned long next_aggregation_sis;
+	/*
+	 * number of sample intervals that should be passed before next ops
+	 * update
+	 */
+	unsigned long next_ops_update_sis;
 
 /* public: */
 	struct task_struct *kdamond;
-- 
cgit v1.2.3


From 78fbfb155d204428119310d1b9df665ab88da6e8 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:44 +0000
Subject: mm/damon/core: define and use a dedicated function for region access
 rate update

Patch series "mm/damon: provide pseudo-moving sum based access rate".

DAMON checks the access to each region for every sampling interval,
increase the access rate counter of the region, namely nr_accesses, if the
access was made.  For every aggregation interval, the counter is reset.
The counter is exposed to users to be used as a metric showing the
relative access rate (frequency) of each region.  In other words, DAMON
provides access rate of each region in every aggregation interval.  The
aggregation avoids temporal access pattern changes making things
confusing.  However, this also makes a few DAMON-related operations to
unnecessarily need to be aligned to the aggregation interval.  This can
restrict the flexibility of DAMON applications, especially when the
aggregation interval is huge.

To provide the monitoring results in finer-grained timing while keeping
handling of temporal access pattern change, this patchset implements a
pseudo-moving sum based access rate metric.  It is pseudo-moving sum
because strict moving sum implementation would need to keep all values for
last time window, and that could incur high overhead of there could be
arbitrary number of values in a time window.  Especially in case of the
nr_accesses, since the sampling interval and aggregation interval can
arbitrarily set and the past values should be maintained for every region,
it could be risky.  The pseudo-moving sum assumes there were no temporal
access pattern change in last discrete time window to remove the needs for
keeping the list of the last time window values.  As a result, it beocmes
not strict moving sum implementation, but provides a reasonable accuracy.

Also, it keeps an important property of the moving sum.  That is, the
moving sum becomes same to discrete-window based sum at the time that
aligns to the time window.  This means using the pseudo moving sum based
nr_accesses makes no change to users who shows the value for every
aggregation interval.

Patches Sequence
----------------

The sequence of the patches is as follows.  The first four patches are for
preparation of the change.  The first two (patches 1 and 2) implements a
helper function for nr_accesses update and eliminate corner case that
skips use of the function, respectively.  Following two (patches 3 and 4)
respectively implement the pseudo-moving sum function and its simple unit
test case.

Two patches for making DAMON to use the pseudo-moving sum follow.  The
fifthe one (patch 5) introduces a new field for representing the
pseudo-moving sum-based access rate of each region, and the sixth one
makes the new representation to actually updated with the pseudo-moving
sum function.

Last two patches (patches 7 and 8) makes followup fixes for skipping
unnecessary updates and marking the moving sum function as static,
respectively.


This patch (of 8):

Each DAMON operarions set is updating nr_accesses field of each
damon_region for each of their access check results, from the
check_accesses() callback.  Directly accessing the field could make things
complex to manage and change in future.  Define and use a dedicated
function for the purpose.

Link: https://lkml.kernel.org/r/20230915025251.72816-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20230915025251.72816-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 9a32b8fd0bd3..17c504d236b9 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -45,7 +45,9 @@ struct damon_addr_range {
  *
  * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be
  * increased for every &damon_attrs->sample_interval if an access to the region
- * during the last sampling interval is found.
+ * during the last sampling interval is found.  The update of this field should
+ * not be done with direct access but with the helper function,
+ * damon_update_region_access_rate().
  *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
@@ -620,6 +622,7 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
+void damon_update_region_access_rate(struct damon_region *r, bool accessed);
 
 struct damos_filter *damos_new_filter(enum damos_filter_type type,
 		bool matching);
-- 
cgit v1.2.3


From d2c062ade07ffd206dd16bf085f02abc59651309 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:46 +0000
Subject: mm/damon/core: implement a pseudo-moving sum function

For values that continuously change, moving average or sum are good ways
to provide fast updates while handling temporal and errorneous variability
of the value.  For example, the access rate counter (nr_accesses) is
calculated as a sum of the number of positive sampled access check results
that collected during a discrete time window (aggregation interval), and
hence it handles temporal and errorneous access check results, but
provides the update only for every aggregation interval.  Using a moving
sum method for that could allow providing the value for every sampling
interval.  That could be useful for getting monitoring results snapshot or
running DAMOS in fine-grained timing.

However, supporting the moving sum for cases that number of samples in the
time window is arbirary could impose high overhead, since the number of
past values that it needs to keep could be too high.  The nr_accesses
would also be one of the cases.  To mitigate the overhead, implement a
pseudo-moving sum function that only provides an estimated pseudo-moving
sum.  It assumes there was no error in last discrete time window and
subtract constant portion of last discrete time window sum.

Note that the function is not strictly implementing the moving sum, but it
keeps a property of moving sum, which makes the value same to the
dsicrete-window based sum for each time window-aligned timing.  Hence,
people collecting the value in the old timings would show no difference.

Link: https://lkml.kernel.org/r/20230915025251.72816-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 17c504d236b9..487a545a11b4 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -622,6 +622,8 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
+unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
+		unsigned int len_window, unsigned int new_value);
 void damon_update_region_access_rate(struct damon_region *r, bool accessed);
 
 struct damos_filter *damos_new_filter(enum damos_filter_type type,
-- 
cgit v1.2.3


From 80333828ea7728ebe85d079bb5c1467eb9fc6c8c Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:48 +0000
Subject: mm/damon/core: introduce nr_accesses_bp

Add yet another representation of the access rate of each region, namely
nr_accesses_bp.  It is just same to the nr_accesses but represents the
value in basis point (1 in 10,000), and updated at once in every
aggregation interval.  That is, moving_accesses_bp is just nr_accesses *
10000.  This may seems useless at the moment.  However, it will be useful
for representing less than one nr_accesses value that will be needed to
make moving sum-based nr_accesses.

Link: https://lkml.kernel.org/r/20230915025251.72816-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 487a545a11b4..15f24b23c9a0 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -40,6 +40,7 @@ struct damon_addr_range {
  * @ar:			The address range of the region.
  * @sampling_addr:	Address of the sample for the next access check.
  * @nr_accesses:	Access frequency of this region.
+ * @nr_accesses_bp:	@nr_accesses in basis point (0.01%).
  * @list:		List head for siblings.
  * @age:		Age of this region.
  *
@@ -49,6 +50,9 @@ struct damon_addr_range {
  * not be done with direct access but with the helper function,
  * damon_update_region_access_rate().
  *
+ * @nr_accesses_bp is another representation of @nr_accesses in basis point
+ * (1 in 10,000) that updated every aggregation interval.
+ *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
  * regions are merged into a new region, both @nr_accesses and @age of the new
@@ -58,6 +62,7 @@ struct damon_region {
 	struct damon_addr_range ar;
 	unsigned long sampling_addr;
 	unsigned int nr_accesses;
+	unsigned int nr_accesses_bp;
 	struct list_head list;
 
 	unsigned int age;
-- 
cgit v1.2.3


From ace30fb21af5f1be1605db72c16040b95b1557ef Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:49 +0000
Subject: mm/damon/core: use pseudo-moving sum for nr_accesses_bp

Let nr_accesses_bp be calculated as a pseudo-moving sum that updated for
every sampling interval, using damon_moving_sum().  This is assumed to be
useful for cases that the aggregation interval is set quite huge, but the
monivoting results need to be collected earlier than next aggregation
interval is passed.

Link: https://lkml.kernel.org/r/20230915025251.72816-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 15f24b23c9a0..0fe13482df63 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -40,7 +40,8 @@ struct damon_addr_range {
  * @ar:			The address range of the region.
  * @sampling_addr:	Address of the sample for the next access check.
  * @nr_accesses:	Access frequency of this region.
- * @nr_accesses_bp:	@nr_accesses in basis point (0.01%).
+ * @nr_accesses_bp:	@nr_accesses in basis point (0.01%) that updated for
+ *			each sampling interval.
  * @list:		List head for siblings.
  * @age:		Age of this region.
  *
@@ -51,7 +52,11 @@ struct damon_addr_range {
  * damon_update_region_access_rate().
  *
  * @nr_accesses_bp is another representation of @nr_accesses in basis point
- * (1 in 10,000) that updated every aggregation interval.
+ * (1 in 10,000) that updated for every &damon_attrs->sample_interval in a
+ * manner similar to moving sum.  By the algorithm, this value becomes
+ * @nr_accesses * 10000 for every &struct damon_attrs->aggr_interval.  This can
+ * be used when the aggregation interval is too huge and therefore cannot wait
+ * for it before getting the access monitoring results.
  *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
@@ -629,7 +634,8 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
 unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
 		unsigned int len_window, unsigned int new_value);
-void damon_update_region_access_rate(struct damon_region *r, bool accessed);
+void damon_update_region_access_rate(struct damon_region *r, bool accessed,
+		struct damon_attrs *attrs);
 
 struct damos_filter *damos_new_filter(enum damos_filter_type type,
 		bool matching);
-- 
cgit v1.2.3


From 863803a7948c8e33e6a7b002017747ca83ecfd63 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:51 +0000
Subject: mm/damon/core: mark damon_moving_sum() as a static function

The function is used by only mm/damon/core.c.  Mark it as a static
function.

Link: https://lkml.kernel.org/r/20230915025251.72816-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 0fe13482df63..491fdd3e4c76 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -632,8 +632,6 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
-unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
-		unsigned int len_window, unsigned int new_value);
 void damon_update_region_access_rate(struct damon_region *r, bool accessed,
 		struct damon_attrs *attrs);
 
-- 
cgit v1.2.3


From 77e6c43e137c130138c3fbadc847351a83c4befe Mon Sep 17 00:00:00 2001
From: Usama Arif <usama.arif@bytedance.com>
Date: Wed, 13 Sep 2023 11:54:00 +0100
Subject: memblock: introduce MEMBLOCK_RSRV_NOINIT flag

For reserved memory regions marked with this flag, reserve_bootmem_region
is not called during memmap_init_reserved_pages.  This can be used to
avoid struct page initialization for regions which won't need them, for
e.g.  hugepages with Hugepage Vmemmap Optimization enabled.

Link: https://lkml.kernel.org/r/20230913105401.519709-4-usama.arif@bytedance.com
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Punit Agrawal <punit.agrawal@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memblock.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 1c1072e3ca06..ae3bde302f70 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -40,6 +40,8 @@ extern unsigned long long max_possible_pfn;
  * via a driver, and never indicated in the firmware-provided memory map as
  * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the
  * kernel resource tree.
+ * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are
+ * not initialized (only for reserved regions).
  */
 enum memblock_flags {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
@@ -47,6 +49,7 @@ enum memblock_flags {
 	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
 	MEMBLOCK_NOMAP		= 0x4,	/* don't add to kernel direct mapping */
 	MEMBLOCK_DRIVER_MANAGED = 0x8,	/* always detected via a driver */
+	MEMBLOCK_RSRV_NOINIT	= 0x10,	/* don't initialize struct pages */
 };
 
 /**
@@ -125,6 +128,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
 int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
+int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size);
 
 void memblock_free_all(void);
 void memblock_free(void *ptr, size_t size);
@@ -259,6 +263,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
 	return m->flags & MEMBLOCK_NOMAP;
 }
 
+static inline bool memblock_is_reserved_noinit(struct memblock_region *m)
+{
+	return m->flags & MEMBLOCK_RSRV_NOINIT;
+}
+
 static inline bool memblock_is_driver_managed(struct memblock_region *m)
 {
 	return m->flags & MEMBLOCK_DRIVER_MANAGED;
-- 
cgit v1.2.3


From 42f994b71404b17abcd6b170de7a6aa95ffe5d4a Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Sat, 16 Sep 2023 02:09:40 +0000
Subject: mm/damon/core: implement scheme-specific apply interval

DAMON-based operation schemes are applied for every aggregation interval.
That was mainly because schemes were using nr_accesses, which be complete
to be used for every aggregation interval.  However, the schemes are now
using nr_accesses_bp, which is updated for each sampling interval in a way
that reasonable to be used.  Therefore, there is no reason to apply
schemes for each aggregation interval.

The unnecessary alignment with aggregation interval was also making some
use cases of DAMOS tricky.  Quotas setting under long aggregation interval
is one such example.  Suppose the aggregation interval is ten seconds, and
there is a scheme having CPU quota 100ms per 1s.  The scheme will actually
uses 100ms per ten seconds, since it cannobe be applied before next
aggregation interval.  The feature is working as intended, but the results
might not that intuitive for some users.  This could be fixed by updating
the quota to 1s per 10s.  But, in the case, the CPU usage of DAMOS could
look like spikes, and would actually make a bad effect to other
CPU-sensitive workloads.

Implement a dedicated timing interval for each DAMON-based operation
scheme, namely apply_interval.  The interval will be sampling interval
aligned, and each scheme will be applied for its apply_interval.  The
interval is set to 0 by default, and it means the scheme should use the
aggregation interval instead.  This avoids old users getting any
behavioral difference.

Link: https://lkml.kernel.org/r/20230916020945.47296-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 491fdd3e4c76..27b995c22497 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -314,16 +314,19 @@ struct damos_access_pattern {
  * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
  * @pattern:		Access pattern of target regions.
  * @action:		&damo_action to be applied to the target regions.
+ * @apply_interval_us:	The time between applying the @action.
  * @quota:		Control the aggressiveness of this scheme.
  * @wmarks:		Watermarks for automated (in)activation of this scheme.
  * @filters:		Additional set of &struct damos_filter for &action.
  * @stat:		Statistics of this scheme.
  * @list:		List head for siblings.
  *
- * For each aggregation interval, DAMON finds regions which fit in the
+ * For each @apply_interval_us, DAMON finds regions which fit in the
  * &pattern and applies &action to those. To avoid consuming too much
  * CPU time or IO resources for the &action, &quota is used.
  *
+ * If @apply_interval_us is zero, &damon_attrs->aggr_interval is used instead.
+ *
  * To do the work only when needed, schemes can be activated for specific
  * system situations using &wmarks.  If all schemes that registered to the
  * monitoring context are inactive, DAMON stops monitoring either, and just
@@ -340,6 +343,14 @@ struct damos_access_pattern {
 struct damos {
 	struct damos_access_pattern pattern;
 	enum damos_action action;
+	unsigned long apply_interval_us;
+/* private: internal use only */
+	/*
+	 * number of sample intervals that should be passed before applying
+	 * @action
+	 */
+	unsigned long next_apply_sis;
+/* public: */
 	struct damos_quota quota;
 	struct damos_watermarks wmarks;
 	struct list_head filters;
@@ -641,7 +652,9 @@ void damos_add_filter(struct damos *s, struct damos_filter *f);
 void damos_destroy_filter(struct damos_filter *f);
 
 struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
-			enum damos_action action, struct damos_quota *quota,
+			enum damos_action action,
+			unsigned long apply_interval_us,
+			struct damos_quota *quota,
 			struct damos_watermarks *wmarks);
 void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
 void damon_destroy_scheme(struct damos *s);
-- 
cgit v1.2.3


From 51a23b1be92046f0cc52384d30cf060700f1a54e Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@cn.fujitsu.com>
Date: Wed, 2 Aug 2023 17:28:56 +0800
Subject: acpi,mm: fix typo sibiling -> sibling

First found this typo as reviewing memory tier code. Fix it by sed like:
$ sed -i 's/sibiling/sibling/g' $(git grep -l sibiling)

so the acpi one will be corrected as well.

Link: https://lkml.kernel.org/r/20230802092856.819328-1-lizhijian@cn.fujitsu.com
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Len Brown <lenb@kernel.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-tiers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 437441cdf78f..4fa178b50784 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -22,7 +22,7 @@
 struct memory_tier;
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
-	struct list_head tier_sibiling;
+	struct list_head tier_sibling;
 	/* abstract distance for this specific memory type */
 	int adistance;
 	/* Nodes of same abstract distance */
-- 
cgit v1.2.3


From 55d2a0bd5eadaade850efa9d3a7ffbb0aeb67198 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Mon, 18 Sep 2023 14:31:42 +0800
Subject: mm: add statistics for PUD level pagetable

Recently, we found that cross-die access to pagetable pages on ARM64
machines can cause performance fluctuations in our business.  Currently,
there are no PMU events available to track this situation on our ARM64
machines, so accurate pagetable accounting can help to analyze this issue,
but now the PUD level pagetable accounting is missed.

So introduce pagetable_pud_ctor/dtor() to help to get accurate PUD
pagetable accounting, as well as converting the architectures which use
generic PUD pagetable allocation to add corresponding PUD pagetable
accounting.  Moreover this patch will mark the PUD level pagetable with
PG_table flag, which will help to do sanity validation in
unpoison_memory().

On my testing machine, I can see more pagetables statistics after the patch
with page-types tool:

Before patch:
        flags           page-count      MB  symbolic-flags                     long-symbolic-flags
0x0000000004000000           27326      106  __________________________g_________________       pgtable
After patch:
0x0000000004000000           27541      107  __________________________g_________________       pgtable

Link: https://lkml.kernel.org/r/876c71c03a7e69c17722a690e3225a4f7b172fb2.1695017383.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 31dc25d3f6b5..126b54b45442 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3049,6 +3049,22 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
 	return ptl;
 }
 
+static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
+{
+	struct folio *folio = ptdesc_folio(ptdesc);
+
+	__folio_set_pgtable(folio);
+	lruvec_stat_add_folio(folio, NR_PAGETABLE);
+}
+
+static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
+{
+	struct folio *folio = ptdesc_folio(ptdesc);
+
+	__folio_clear_pgtable(folio);
+	lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+}
+
 extern void __init pagecache_init(void);
 extern void free_initmem(void);
 
-- 
cgit v1.2.3


From 7ced098fcfe596feab3cea4f40128b0119c7bf1a Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 18 Sep 2023 22:18:32 +0200
Subject: mm: document mmu_notifier_invalidate_range_start_nonblock()

Document what mmu_notifier_invalidate_range_start_nonblock() is for.  Also
add a __must_check annotation to signal that callers must bail out if a
notifier vetoes the operation.

Link: https://lkml.kernel.org/r/20230918201832.265108-1-jannh@google.com
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 6e3c857606f1..f349e08a9dfe 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -459,7 +459,14 @@ mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 }
 
-static inline int
+/*
+ * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it
+ * can return an error if a notifier can't proceed without blocking, in which
+ * case you're not allowed to modify PTEs in the specified range.
+ *
+ * This is mainly intended for OOM handling.
+ */
+static inline int __must_check
 mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
 {
 	int ret = 0;
-- 
cgit v1.2.3


From 24e41bf8a6b424c76c5902fb999e9eca61bdf83d Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Mon, 28 Aug 2023 17:08:57 +0200
Subject: mm: add a NO_INHERIT flag to the PR_SET_MDWE prctl

This extends the current PR_SET_MDWE prctl arg with a bit to indicate that
the process doesn't want MDWE protection to propagate to children.

To implement this no-inherit mode, the tag in current->mm->flags must be
absent from MMF_INIT_MASK.  This means that the encoding for "MDWE but
without inherit" is different in the prctl than in the mm flags.  This
leads to a bit of bit-mangling in the prctl implementation.

Link: https://lkml.kernel.org/r/20230828150858.393570-6-revest@chromium.org
Signed-off-by: Florent Revest <revest@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alexey Izbyshev <izbyshev@ispras.ru>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ayush Jain <ayush.jain3@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Joey Gouly <joey.gouly@arm.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
Cc: Topi Miettinen <toiwoton@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/coredump.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 0ee96ea7a0e9..1b37fa8fc723 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -91,4 +91,14 @@ static inline int get_dumpable(struct mm_struct *mm)
 				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
 
 #define MMF_VM_MERGE_ANY	29
+#define MMF_HAS_MDWE_NO_INHERIT	30
+
+static inline unsigned long mmf_init_flags(unsigned long flags)
+{
+	if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
+		flags &= ~((1UL << MMF_HAS_MDWE) |
+			   (1UL << MMF_HAS_MDWE_NO_INHERIT));
+	return flags & MMF_INIT_MASK;
+}
+
 #endif /* _LINUX_SCHED_COREDUMP_H */
-- 
cgit v1.2.3


From 65610453459f9048678a0daef89d592e412ec00a Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 21 Sep 2023 15:44:12 +0800
Subject: mm: memory: add vm_normal_folio_pmd()

Patch series "mm: convert numa balancing functions to use a folio", v2.

do_numa_pages() only handles non-compound pages, and only PMD-mapped THPs
are handled in do_huge_pmd_numa_page().  But a large, PTE-mapped folio
will be supported so let's convert more numa balancing functions to
use/take a folio in preparation for that, no functional change intended
for now.


This patch (of 6):

The new vm_normal_folio_pmd() wrapper is similar to vm_normal_folio(),
which allow them to completely replace the struct page variables with
struct folio variables.

Link: https://lkml.kernel.org/r/20230921074417.24004-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20230921074417.24004-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 126b54b45442..52c40b3d0813 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2327,6 +2327,8 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
+struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
+				  unsigned long addr, pmd_t pmd);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 				pmd_t pmd);
 
-- 
cgit v1.2.3


From 75c70128a67311070115b90d826a229d4bbbb2b5 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 21 Sep 2023 15:44:16 +0800
Subject: mm: mempolicy: make mpol_misplaced() to take a folio

In preparation for large folio numa balancing, make mpol_misplaced() to
take a folio, no functional change intended.

Link: https://lkml.kernel.org/r/20230921074417.24004-6-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..6c2754d7bfed 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -174,7 +174,7 @@ extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
 /* Check if a vma is migratable */
 extern bool vma_migratable(struct vm_area_struct *vma);
 
-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long);
 extern void mpol_put_task_policy(struct task_struct *);
 
 static inline bool mpol_is_preferred_many(struct mempolicy *pol)
@@ -278,7 +278,8 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
 }
 #endif
 
-static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+static inline int mpol_misplaced(struct folio *folio,
+				 struct vm_area_struct *vma,
 				 unsigned long address)
 {
 	return -1; /* no node preference */
-- 
cgit v1.2.3


From 8c9ae56dc73b5ae48a14000b96292bd4f2aeb710 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 21 Sep 2023 15:44:17 +0800
Subject: sched/numa, mm: make numa migrate functions to take a folio

The cpupid (or access time) is stored in the head page for THP, so it is
safely to make should_numa_migrate_memory() and numa_hint_fault_latency()
to take a folio.  This is in preparation for large folio numa balancing.

Link: https://lkml.kernel.org/r/20230921074417.24004-7-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/numa_balancing.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..06a9d35650f0 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -20,8 +20,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p, bool final);
-extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
-					int src_nid, int dst_cpu);
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
+				int src_nid, int dst_cpu);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
@@ -38,7 +38,7 @@ static inline void task_numa_free(struct task_struct *p, bool final)
 {
 }
 static inline bool should_numa_migrate_memory(struct task_struct *p,
-				struct page *page, int src_nid, int dst_cpu)
+				struct folio *folio, int src_nid, int dst_cpu)
 {
 	return true;
 }
-- 
cgit v1.2.3


From 3c6f33b7273a7e2f2b2497b62c8400bd957b2fbe Mon Sep 17 00:00:00 2001
From: Stefan Roesch <shr@devkernel.io>
Date: Fri, 22 Sep 2023 14:11:40 -0700
Subject: mm/ksm: support fork/exec for prctl

Patch series "mm/ksm: add fork-exec support for prctl", v4.

A process can enable KSM with the prctl system call.  When the process is
forked the KSM flag is inherited by the child process.  However if the
process is executing an exec system call directly after the fork, the KSM
setting is cleared.  This patch series addresses this problem.

1) Change the mask in coredump.h for execing a new process
2) Add a new test case in ksm_functional_tests


This patch (of 2):

Today we have two ways to enable KSM:

1) madvise system call
   This allows to enable KSM for a memory region for a long time.

2) prctl system call
   This is a recent addition to enable KSM for the complete process.
   In addition when a process is forked, the KSM setting is inherited.

This change only affects the second case.

One of the use cases for (2) was to support the ability to enable
KSM for cgroups. This allows systemd to enable KSM for the seed
process. By enabling it in the seed process all child processes inherit
the setting.

This works correctly when the process is forked. However it doesn't
support fork/exec workflow.

From the previous cover letter:

....
Use case 3:
With the madvise call sharing opportunities are only enabled for the
current process: it is a workload-local decision. A considerable number
of sharing opportunities may exist across multiple workloads or jobs
(if they are part of the same security domain). Only a higler level
entity like a job scheduler or container can know for certain if its
running one or more instances of a job. That job scheduler however
doesn't have the necessary internal workload knowledge to make targeted
madvise calls.
....

In addition it can also be a bit surprising that fork keeps the KSM
setting and fork/exec does not.

Link: https://lkml.kernel.org/r/20230922211141.320789-1-shr@devkernel.io
Link: https://lkml.kernel.org/r/20230922211141.320789-2-shr@devkernel.io
Signed-off-by: Stefan Roesch <shr@devkernel.io>
Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process")
Reviewed-by: David Hildenbrand <david@redhat.com>
Reported-by: Carl Klemm <carl@uvos.xyz>
Tested-by: Carl Klemm <carl@uvos.xyz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/coredump.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 1b37fa8fc723..02f5090ffea2 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
 #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
+#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 #define MMF_OOM_REAP_QUEUED	25	/* mm was queued for oom_reaper */
 #define MMF_MULTIPROCESS	26	/* mm is shared between processes */
 /*
@@ -85,13 +86,15 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_HAS_MDWE		28
 #define MMF_HAS_MDWE_MASK	(1 << MMF_HAS_MDWE)
 
-#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
-#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
-				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
+#define MMF_HAS_MDWE_NO_INHERIT	29
 
-#define MMF_VM_MERGE_ANY	29
-#define MMF_HAS_MDWE_NO_INHERIT	30
+#define MMF_VM_MERGE_ANY	30
+#define MMF_VM_MERGE_ANY_MASK	(1 << MMF_VM_MERGE_ANY)
+
+#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
+				 MMF_VM_MERGE_ANY_MASK)
 
 static inline unsigned long mmf_init_flags(unsigned long flags)
 {
-- 
cgit v1.2.3


From a08c7193e4f18dc8508f2d07d0de2c5b94cb39a3 Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Date: Tue, 26 Sep 2023 12:20:17 -0700
Subject: mm/filemap: remove hugetlb special casing in filemap.c

Remove special cased hugetlb handling code within the page cache by
changing the granularity of ->index to the base page size rather than the
huge page size.  The motivation of this patch is to reduce complexity
within the filemap code while also increasing performance by removing
branches that are evaluated on every page cache lookup.

To support the change in index, new wrappers for hugetlb page cache
interactions are added.  These wrappers perform the conversion to a linear
index which is now expected by the page cache for huge pages.

========================= PERFORMANCE ======================================

Perf was used to check the performance differences after the patch.
Overall the performance is similar to mainline with a very small larger
overhead that occurs in __filemap_add_folio() and
hugetlb_add_to_page_cache().  This is because of the larger overhead that
occurs in xa_load() and xa_store() as the xarray is now using more entries
to store hugetlb folios in the page cache.

Timing

aarch64
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt
            real    1m49.568s
            user    0m0.000s
            sys     1m49.461s

        6.5-rc3:
            [root]# time fallocate -l 700GB test.txt
            real    1m47.495s
            user    0m0.000s
            sys     1m47.370s
    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m47.024s
            user    0m0.000s
            sys     1m46.921s

        6.5-rc3:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m44.551s
            user    0m0.000s
            sys     1m44.438s

x86
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt
            real    0m22.383s
            user    0m0.000s
            sys     0m22.255s

        6.5-rc3:
            [opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt
            real    0m22.735s
            user    0m0.038s
            sys     0m22.567s

    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt
            real    0m25.786s
            user    0m0.001s
            sys     0m25.589s

        6.5-rc3:
            [root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt
            real    0m33.454s
            user    0m0.001s
            sys     0m33.193s

aarch64:
    workload - fallocate a 700GB file backed by huge pages

    6.5-rc3 + this patch:
        2MB Page Size:
            --100.00%--__arm64_sys_fallocate
                          ksys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                          |--95.04%--__pi_clear_page
                          |
                          |--3.57%--clear_huge_page
                          |          |
                          |          |--2.63%--rcu_all_qs
                          |          |
                          |           --0.91%--__cond_resched
                          |
                           --0.67%--__cond_resched
            0.17%     0.00%             0  fallocate  [kernel.vmlinux]       [k] hugetlb_add_to_page_cache
            0.14%     0.10%            11  fallocate  [kernel.vmlinux]       [k] __filemap_add_folio

    6.5-rc3
        2MB Page Size:
                --100.00%--__arm64_sys_fallocate
                          ksys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                          |--94.91%--__pi_clear_page
                          |
                          |--4.11%--clear_huge_page
                          |          |
                          |          |--3.00%--rcu_all_qs
                          |          |
                          |           --1.10%--__cond_resched
                          |
                           --0.59%--__cond_resched
            0.08%     0.01%             1  fallocate  [kernel.kallsyms]  [k] hugetlb_add_to_page_cache
            0.05%     0.03%             3  fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

x86
    workload - fallocate a 100GB file backed by huge pages

    6.5-rc3 + this patch:
        2MB Page Size:
            hugetlbfs_fallocate
            |
            --99.57%--clear_huge_page
                |
                --98.47%--clear_page_erms
                    |
                    --0.53%--asm_sysvec_apic_timer_interrupt

            0.04%     0.04%             1  fallocate  [kernel.kallsyms]     [k] xa_load
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] hugetlb_add_to_page_cache
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] __filemap_add_folio
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] xas_store

    6.5-rc3
        2MB Page Size:
                --99.93%--__x64_sys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                           --99.38%--clear_huge_page
                                     |
                                     |--98.40%--clear_page_erms
                                     |
                                      --0.59%--__cond_resched
            0.03%     0.03%             1  fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

========================= TESTING ======================================

This patch passes libhugetlbfs tests and LTP hugetlb tests

********** TEST SUMMARY
*                      2M
*                      32-bit 64-bit
*     Total testcases:   110    113
*             Skipped:     0      0
*                PASS:   107    113
*                FAIL:     0      0
*    Killed by signal:     3      0
*   Bad configuration:     0      0
*       Expected FAIL:     0      0
*     Unexpected PASS:     0      0
*    Test not present:     0      0
* Strange test result:     0      0
**********

    Done executing testcases.
    LTP Version:  20220527-178-g2761a81c4

page migration was also tested using Mike Kravetz's test program.[8]

[dan.carpenter@linaro.org: fix an NULL vs IS_ERR() bug]
  Link: https://lkml.kernel.org/r/1772c296-1417-486f-8eef-171af2192681@moroto.mountain
Link: https://lkml.kernel.org/r/20230926192017.98183-1-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reported-and-tested-by: syzbot+c225dea486da4d5592bd@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c225dea486da4d5592bd
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 12 ++++++++++++
 include/linux/pagemap.h | 32 ++------------------------------
 2 files changed, 14 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a30686e649f7..d935b0584f1d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -812,6 +812,12 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
 	return huge_page_size(h) / 512;
 }
 
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+				struct address_space *mapping, pgoff_t idx)
+{
+	return filemap_lock_folio(mapping, idx << huge_page_order(h));
+}
+
 #include <asm/hugetlb.h>
 
 #ifndef is_hugepage_only_range
@@ -1008,6 +1014,12 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
 	return NULL;
 }
 
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+				struct address_space *mapping, pgoff_t idx)
+{
+	return NULL;
+}
+
 static inline int isolate_or_dissolve_huge_page(struct page *page,
 						struct list_head *list)
 {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 351c3b7f93a1..759b29d9a69a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -789,9 +789,6 @@ static inline pgoff_t folio_next_index(struct folio *folio)
  */
 static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
 {
-	/* HugeTLBfs indexes the page cache in units of hpage_size */
-	if (folio_test_hugetlb(folio))
-		return &folio->page;
 	return folio_page(folio, index & (folio_nr_pages(folio) - 1));
 }
 
@@ -807,9 +804,6 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
  */
 static inline bool folio_contains(struct folio *folio, pgoff_t index)
 {
-	/* HugeTLBfs indexes the page cache in units of hpage_size */
-	if (folio_test_hugetlb(folio))
-		return folio->index == index;
 	return index - folio_index(folio) < folio_nr_pages(folio);
 }
 
@@ -867,10 +861,9 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping,
 }
 
 /*
- * Get index of the page within radix-tree (but not for hugetlb pages).
- * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
+ * Get the offset in PAGE_SIZE (even for hugetlb pages).
  */
-static inline pgoff_t page_to_index(struct page *page)
+static inline pgoff_t page_to_pgoff(struct page *page)
 {
 	struct page *head;
 
@@ -885,19 +878,6 @@ static inline pgoff_t page_to_index(struct page *page)
 	return head->index + page - head;
 }
 
-extern pgoff_t hugetlb_basepage_index(struct page *page);
-
-/*
- * Get the offset in PAGE_SIZE (even for hugetlb pages).
- * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
- */
-static inline pgoff_t page_to_pgoff(struct page *page)
-{
-	if (unlikely(PageHuge(page)))
-		return hugetlb_basepage_index(page);
-	return page_to_index(page);
-}
-
 /*
  * Return byte-offset into filesystem object for page.
  */
@@ -934,24 +914,16 @@ static inline loff_t folio_file_pos(struct folio *folio)
 
 /*
  * Get the offset in PAGE_SIZE (even for hugetlb folios).
- * (TODO: hugetlb folios should have ->index in PAGE_SIZE)
  */
 static inline pgoff_t folio_pgoff(struct folio *folio)
 {
-	if (unlikely(folio_test_hugetlb(folio)))
-		return hugetlb_basepage_index(&folio->page);
 	return folio->index;
 }
 
-extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
-				     unsigned long address);
-
 static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 					unsigned long address)
 {
 	pgoff_t pgoff;
-	if (unlikely(is_vm_hugetlb_page(vma)))
-		return linear_hugepage_index(vma, address);
 	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
 	pgoff += vma->vm_pgoff;
 	return pgoff;
-- 
cgit v1.2.3


From 07a8bdd4120ced3490ef9adf51b8086af0aaa8e7 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 26 Sep 2023 14:06:25 +0800
Subject: memory tiering: add abstract distance calculation algorithms
 management

Patch series "memory tiering: calculate abstract distance based on ACPI
HMAT", v4.

We have the explicit memory tiers framework to manage systems with
multiple types of memory, e.g., DRAM in DIMM slots and CXL memory devices.
Where, same kind of memory devices will be grouped into memory types,
then put into memory tiers.  To describe the performance of a memory type,
abstract distance is defined.  Which is in direct proportion to the memory
latency and inversely proportional to the memory bandwidth.  To keep the
code as simple as possible, fixed abstract distance is used in dax/kmem to
describe slow memory such as Optane DCPMM.

To support more memory types, in this series, we added the abstract
distance calculation algorithm management mechanism, provided a algorithm
implementation based on ACPI HMAT, and used the general abstract distance
calculation interface in dax/kmem driver.  So, dax/kmem can support HBM
(high bandwidth memory) in addition to the original Optane DCPMM.


This patch (of 4):

The abstract distance may be calculated by various drivers, such as ACPI
HMAT, CXL CDAT, etc.  While it may be used by various code which hot-add
memory node, such as dax/kmem etc.  To decouple the algorithm users and
the providers, the abstract distance calculation algorithms management
mechanism is implemented in this patch.  It provides interface for the
providers to register the implementation, and interface for the users.

Multiple algorithm implementations can cooperate via calculating abstract
distance for different memory nodes.  The preference of algorithm
implementations can be specified via priority (notifier_block.priority).

Link: https://lkml.kernel.org/r/20230926060628.265989-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20230926060628.265989-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Bharata B Rao <bharata@amd.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Rafael J Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-tiers.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 4fa178b50784..fddc59990644 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -6,6 +6,7 @@
 #include <linux/nodemask.h>
 #include <linux/kref.h>
 #include <linux/mmzone.h>
+#include <linux/notifier.h>
 /*
  * Each tier cover a abstrace distance chunk size of 128
  */
@@ -36,6 +37,9 @@ struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+int register_mt_adistance_algorithm(struct notifier_block *nb);
+int unregister_mt_adistance_algorithm(struct notifier_block *nb);
+int mt_calc_adistance(int node, int *adist);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -97,5 +101,20 @@ static inline bool node_is_toptier(int node)
 {
 	return true;
 }
+
+static inline int register_mt_adistance_algorithm(struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline int unregister_mt_adistance_algorithm(struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline int mt_calc_adistance(int node, int *adist)
+{
+	return NOTIFY_DONE;
+}
 #endif	/* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
-- 
cgit v1.2.3


From 3718c02dbd4c88d47b5af003acdb3d1112604ea3 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 26 Sep 2023 14:06:27 +0800
Subject: acpi, hmat: calculate abstract distance with HMAT

A memory tiering abstract distance calculation algorithm based on ACPI
HMAT is implemented.  The basic idea is as follows.

The performance attributes of system default DRAM nodes are recorded as
the base line.  Whose abstract distance is MEMTIER_ADISTANCE_DRAM.  Then,
the ratio of the abstract distance of a memory node (target) to
MEMTIER_ADISTANCE_DRAM is scaled based on the ratio of the performance
attributes of the node to that of the default DRAM nodes.

The functions to record the read/write latency/bandwidth of the default
DRAM nodes and calculate abstract distance according to read/write
latency/bandwidth ratio will be used by CXL CDAT (Coherent Device
Attribute Table) and other memory device drivers.  So, they are put in
memory-tiers.c.

Link: https://lkml.kernel.org/r/20230926060628.265989-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Bharata B Rao <bharata@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Rafael J Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-tiers.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index fddc59990644..59d15c6d3c0d 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -31,8 +31,11 @@ struct memory_dev_type {
 	struct kref kref;
 };
 
+struct node_hmem_attrs;
+
 #ifdef CONFIG_NUMA
 extern bool numa_demotion_enabled;
+extern struct memory_dev_type *default_dram_type;
 struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
@@ -40,6 +43,9 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype);
 int register_mt_adistance_algorithm(struct notifier_block *nb);
 int unregister_mt_adistance_algorithm(struct notifier_block *nb);
 int mt_calc_adistance(int node, int *adist);
+int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+			     const char *source);
+int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -64,6 +70,7 @@ static inline bool node_is_toptier(int node)
 #else
 
 #define numa_demotion_enabled	false
+#define default_dram_type	NULL
 /*
  * CONFIG_NUMA implementation returns non NULL error.
  */
@@ -116,5 +123,16 @@ static inline int mt_calc_adistance(int node, int *adist)
 {
 	return NOTIFY_DONE;
 }
+
+static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+					   const char *source)
+{
+	return -EIO;
+}
+
+static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+{
+	return -EIO;
+}
 #endif	/* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
-- 
cgit v1.2.3


From 6bc2cfdf82d56863b7cf5e86e37a662b2ae5d47e Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 26 Sep 2023 14:06:28 +0800
Subject: dax, kmem: calculate abstract distance with general interface

Previously, a fixed abstract distance MEMTIER_DEFAULT_DAX_ADISTANCE is
used for slow memory type in kmem driver.  This limits the usage of kmem
driver, for example, it cannot be used for HBM (high bandwidth memory).

So, we use the general abstract distance calculation mechanism in kmem
drivers to get more accurate abstract distance on systems with proper
support.  The original MEMTIER_DEFAULT_DAX_ADISTANCE is used as fallback
only.

Now, multiple memory types may be managed by kmem.  These memory types are
put into the "kmem_memory_types" list and protected by
kmem_memory_type_lock.

Link: https://lkml.kernel.org/r/20230926060628.265989-5-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Bharata B Rao <bharata@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Rafael J Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-tiers.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 59d15c6d3c0d..1e39d27bee41 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -24,6 +24,8 @@ struct memory_tier;
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
 	struct list_head tier_sibling;
+	/* list of memory types that are managed by one driver */
+	struct list_head list;
 	/* abstract distance for this specific memory type */
 	int adistance;
 	/* Nodes of same abstract distance */
-- 
cgit v1.2.3


From b7c67206594a56be2407ae4da54a114c90609e53 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:53:28 -0700
Subject: mm/memcg: annotate struct mem_cgroup_threshold_ary with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute.  Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct
mem_cgroup_threshold_ary.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Link: https://lkml.kernel.org/r/20230922175327.work.985-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 031102ac9311..c6029aeaa268 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -143,7 +143,7 @@ struct mem_cgroup_threshold_ary {
 	/* Size of entries[] */
 	unsigned int size;
 	/* Array of thresholds */
-	struct mem_cgroup_threshold entries[];
+	struct mem_cgroup_threshold entries[] __counted_by(size);
 };
 
 struct mem_cgroup_thresholds {
-- 
cgit v1.2.3


From d61ea1cb009532dcbd77a9d44b812704cec60146 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 21 Aug 2023 19:15:13 +0500
Subject: userfaultfd: UFFD_FEATURE_WP_ASYNC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Implement IOCTL to get and optionally clear info about
PTEs", v33.

*Motivation*
The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows
GetWriteWatch() and ResetWriteWatch() syscalls [1].  The GetWriteWatch()
retrieves the addresses of the pages that are written to in a region of
virtual memory.

This syscall is used in Windows applications and games etc.  This syscall
is being emulated in pretty slow manner in userspace.  Our purpose is to
enhance the kernel such that we translate it efficiently in a better way.
Currently some out of tree hack patches are being used to efficiently
emulate it in some kernels.  We intend to replace those with these
patches.  So the whole gaming on Linux can effectively get benefit from
this.  It means there would be tons of users of this code.

CRIU use case [2] was mentioned by Andrei and Danylo:
> Use cases for migrating sparse VMAs are binaries sanitized with ASAN,
> MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of
> shadow memory [4]. Being able to migrate such binaries allows to highly
> reduce the amount of work needed to identify and fix post-migration
> crashes, which happen constantly.

Andrei defines the following uses of this code:
* it is more granular and allows us to track changed pages more
  effectively. The current interface can clear dirty bits for the entire
  process only. In addition, reading info about pages is a separate
  operation. It means we must freeze the process to read information
  about all its pages, reset dirty bits, only then we can start dumping
  pages. The information about pages becomes more and more outdated,
  while we are processing pages. The new interface solves both these
  downsides. First, it allows us to read pte bits and clear the
  soft-dirty bit atomically. It means that CRIU will not need to freeze
  processes to pre-dump their memory. Second, it clears soft-dirty bits
  for a specified region of memory. It means CRIU will have actual info
  about pages to the moment of dumping them.
* The new interface has to be much faster because basic page filtering
  is happening in the kernel. With the old interface, we have to read
  pagemap for each page.

*Implementation Evolution (Short Summary)*
From the definition of GetWriteWatch(), we feel like kernel's soft-dirty
feature can be used under the hood with some additions like:
* reset soft-dirty flag for only a specific region of memory instead of
clearing the flag for the entire process
* get and clear soft-dirty flag for a specific region atomically

So we decided to use ioctl on pagemap file to read or/and reset soft-dirty
flag. But using soft-dirty flag, sometimes we get extra pages which weren't
even written. They had become soft-dirty because of VMA merging and
VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were
able to by-pass this short coming by ignoring VM_SOFTDIRTY until David
reported that mprotect etc messes up the soft-dirty flag while ignoring
VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We
discussed if we can revert these patches. But we could not reach to any
conclusion. So at this point, I made couple of tries to solve this whole
VM_SOFTDIRTY issue by correcting the soft-dirty implementation:
* [7] Correct the bug fixed wrongly back in 2014. It had potential to cause
regression. We left it behind.
* [8] Keep a list of soft-dirty part of a VMA across splits and merges. I
got the reply don't increase the size of the VMA by 8 bytes.

At this point, we left soft-dirty considering it is too much delicate and
userfaultfd [9] seemed like the only way forward. From there onward, we
have been basing soft-dirty emulation on userfaultfd wp feature where
kernel resolves the faults itself when WP_ASYNC feature is used. It was
straight forward to add WP_ASYNC feature in userfautlfd. Now we get only
those pages dirty or written-to which are really written in reality. (PS
There is another WP_UNPOPULATED userfautfd feature is required which is
needed to avoid pre-faulting memory before write-protecting [9].)

All the different masks were added on the request of CRIU devs to create
interface more generic and better.

[1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-getwritewatch
[2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com
[3] https://github.com/google/sanitizers
[4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit
[5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com
[6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/
[7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.com
[8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.com
[9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com
[10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com


This patch (of 6):

Add a new userfaultfd-wp feature UFFD_FEATURE_WP_ASYNC, that allows
userfaultfd wr-protect faults to be resolved by the kernel directly.

It can be used like a high accuracy version of soft-dirty, without vma
modifications during tracking, and also with ranged support by default
rather than for a whole mm when reset the protections due to existence of
ioctl(UFFDIO_WRITEPROTECT).

Several goals of such a dirty tracking interface:

1. All types of memory should be supported and tracable. This is nature
   for soft-dirty but should mention when the context is userfaultfd,
   because it used to only support anon/shmem/hugetlb. The problem is for
   a dirty tracking purpose these three types may not be enough, and it's
   legal to track anything e.g. any page cache writes from mmap.

2. Protections can be applied to partial of a memory range, without vma
   split/merge fuss.  The hope is that the tracking itself should not
   affect any vma layout change.  It also helps when reset happens because
   the reset will not need mmap write lock which can block the tracee.

3. Accuracy needs to be maintained.  This means we need pte markers to work
   on any type of VMA.

One could question that, the whole concept of async dirty tracking is not
really close to fundamentally what userfaultfd used to be: it's not "a
fault to be serviced by userspace" anymore. However, using userfaultfd-wp
here as a framework is convenient for us in at least:

1. VM_UFFD_WP vma flag, which has a very good name to suite something like
   this, so we don't need VM_YET_ANOTHER_SOFT_DIRTY. Just use a new
   feature bit to identify from a sync version of uffd-wp registration.

2. PTE markers logic can be leveraged across the whole kernel to maintain
   the uffd-wp bit as long as an arch supports, this also applies to this
   case where uffd-wp bit will be a hint to dirty information and it will
   not go lost easily (e.g. when some page cache ptes got zapped).

3. Reuse ioctl(UFFDIO_WRITEPROTECT) interface for either starting or
   resetting a range of memory, while there's no counterpart in the old
   soft-dirty world, hence if this is wanted in a new design we'll need a
   new interface otherwise.

We can somehow understand that commonality because uffd-wp was
fundamentally a similar idea of write-protecting pages just like
soft-dirty.

This implementation allows WP_ASYNC to imply WP_UNPOPULATED, because so
far WP_ASYNC seems to not usable if without WP_UNPOPULATE.  This also
gives us chance to modify impl of WP_ASYNC just in case it could be not
depending on WP_UNPOPULATED anymore in the future kernels.  It's also fine
to imply that because both features will rely on PTE_MARKER_UFFD_WP config
option, so they'll show up together (or both missing) in an UFFDIO_API
probe.

vma_can_userfault() now allows any VMA if the userfaultfd registration is
only about async uffd-wp.  So we can track dirty for all kinds of memory
including generic file systems (like XFS, EXT4 or BTRFS).

One trick worth mention in do_wp_page() is that we need to manually update
vmf->orig_pte here because it can be used later with a pte_same() check -
this path always has FAULT_FLAG_ORIG_PTE_VALID set in the flags.

The major defect of this approach of dirty tracking is we need to populate
the pgtables when tracking starts.  Soft-dirty doesn't do it like that.
It's unwanted in the case where the range of memory to track is huge and
unpopulated (e.g., tracking updates on a 10G file with mmap() on top,
without having any page cache installed yet).  One way to improve this is
to allow pte markers exist for larger than PTE level for PMD+.  That will
not change the interface if to implemented, so we can leave that for
later.

Link: https://lkml.kernel.org/r/20230821141518.870589-1-usama.anjum@collabora.com
Link: https://lkml.kernel.org/r/20230821141518.870589-2-usama.anjum@collabora.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Co-developed-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Miroslaw <emmir@google.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Paul Gofman <pgofman@codeweavers.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yun Zhou <yun.zhou@windriver.com>
Cc: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/userfaultfd_k.h | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ac8c6854097c..c98df391bfd8 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -161,11 +161,22 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 }
 
 static inline bool vma_can_userfault(struct vm_area_struct *vma,
-				     unsigned long vm_flags)
+				     unsigned long vm_flags,
+				     bool wp_async)
 {
+	vm_flags &= __VM_UFFD_FLAGS;
+
 	if ((vm_flags & VM_UFFD_MINOR) &&
 	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
 		return false;
+
+	/*
+	 * If wp async enabled, and WP is the only mode enabled, allow any
+	 * memory type.
+	 */
+	if (wp_async && (vm_flags == VM_UFFD_WP))
+		return true;
+
 #ifndef CONFIG_PTE_MARKER_UFFD_WP
 	/*
 	 * If user requested uffd-wp but not enabled pte markers for
@@ -175,6 +186,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
 		return false;
 #endif
+
+	/* By default, allow any of anon|shmem|hugetlb */
 	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
 	    vma_is_shmem(vma);
 }
@@ -197,6 +210,7 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
 extern void userfaultfd_unmap_complete(struct mm_struct *mm,
 				       struct list_head *uf);
 extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
+extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
 
 #else /* CONFIG_USERFAULTFD */
 
@@ -297,6 +311,11 @@ static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+	return false;
+}
+
 #endif /* CONFIG_USERFAULTFD */
 
 static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
-- 
cgit v1.2.3


From 52526ca7fdb905a768a93f8faa418e9b988fc34b Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Mon, 21 Aug 2023 19:15:14 +0500
Subject: fs/proc/task_mmu: implement IOCTL to get and optionally clear info
 about PTEs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally
clear the info about page table entries. The following operations are
supported in this IOCTL:
- Scan the address range and get the memory ranges matching the provided
  criteria. This is performed when the output buffer is specified.
- Write-protect the pages. The PM_SCAN_WP_MATCHING is used to write-protect
  the pages of interest. The PM_SCAN_CHECK_WPASYNC aborts the operation if
  non-Async Write Protected pages are found. The ``PM_SCAN_WP_MATCHING``
  can be used with or without PM_SCAN_CHECK_WPASYNC.
- Both of those operations can be combined into one atomic operation where
  we can get and write protect the pages as well.

Following flags about pages are currently supported:
- PAGE_IS_WPALLOWED - Page has async-write-protection enabled
- PAGE_IS_WRITTEN - Page has been written to from the time it was write protected
- PAGE_IS_FILE - Page is file backed
- PAGE_IS_PRESENT - Page is present in the memory
- PAGE_IS_SWAPPED - Page is in swapped
- PAGE_IS_PFNZERO - Page has zero PFN
- PAGE_IS_HUGE - Page is THP or Hugetlb backed

This IOCTL can be extended to get information about more PTE bits. The
entire address range passed by user [start, end) is scanned until either
the user provided buffer is full or max_pages have been found.

[akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"]
[akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning]
[arnd@arndb.de: hide unused pagemap_scan_backout_range() function]
  Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org
[sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"]
  Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au
Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Miroslaw <emmir@google.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Paul Gofman <pgofman@codeweavers.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yun Zhou <yun.zhou@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h       | 1 +
 include/linux/userfaultfd_k.h | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e5b9f7e62eeb..205469aa0613 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -280,6 +280,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long cp_flags);
 
 bool is_hugetlb_entry_migration(pte_t pte);
+bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index c98df391bfd8..f2dc19f40d05 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -221,6 +221,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf,
 	return VM_FAULT_SIGBUS;
 }
 
+static inline long uffd_wp_range(struct vm_area_struct *vma,
+				 unsigned long start, unsigned long len,
+				 bool enable_wp)
+{
+	return false;
+}
+
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 					struct vm_userfaultfd_ctx vm_ctx)
 {
-- 
cgit v1.2.3


From ee615d4585cfc305bf6c218a62123c3051f8b4a3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 29 Sep 2023 20:25:38 -0700
Subject: shmem: shrink shmem_inode_info: dir_offsets in a union

Patch series "shmem,tmpfs: general maintenance".

Mostly just cosmetic mods in mm/shmem.c, but the last two enforcing the
"size=" limit better.  8/8 goes into percpu counter territory, and could
stand alone.


This patch (of 8):

Shave 32 bytes off (the 64-bit) shmem_inode_info.  There was a 4-byte
pahole after stop_eviction, better filled by fsflags.  And the 24-byte
dir_offsets can only be used by directories, whereas shrinklist and
swaplist only by shmem_mapping() inodes (regular files or long symlinks):
so put those into a union.  No change in mm/shmem.c is required for this.

Link: https://lkml.kernel.org/r/c7441dc6-f3bb-dd60-c670-9f5cbd9f266@google.com
Link: https://lkml.kernel.org/r/86ebb4b-c571-b9e8-27f5-cb82ec50357e@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Carlos Maiolino <cem@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 6b0c626620f5..2caa6b86106a 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -23,18 +23,22 @@ struct shmem_inode_info {
 	unsigned long		flags;
 	unsigned long		alloced;	/* data pages alloced to file */
 	unsigned long		swapped;	/* subtotal assigned to swap */
-	pgoff_t			fallocend;	/* highest fallocate endindex */
-	struct list_head        shrinklist;     /* shrinkable hpage inodes */
-	struct list_head	swaplist;	/* chain of maybes on swap */
+	union {
+	    struct offset_ctx	dir_offsets;	/* stable directory offsets */
+	    struct {
+		struct list_head shrinklist;	/* shrinkable hpage inodes */
+		struct list_head swaplist;	/* chain of maybes on swap */
+	    };
+	};
+	struct timespec64	i_crtime;	/* file creation time */
 	struct shared_policy	policy;		/* NUMA memory alloc policy */
 	struct simple_xattrs	xattrs;		/* list of xattrs */
+	pgoff_t			fallocend;	/* highest fallocate endindex */
+	unsigned int		fsflags;	/* for FS_IOC_[SG]ETFLAGS */
 	atomic_t		stop_eviction;	/* hold when working on inode */
-	struct timespec64	i_crtime;	/* file creation time */
-	unsigned int		fsflags;	/* flags for FS_IOC_[SG]ETFLAGS */
 #ifdef CONFIG_TMPFS_QUOTA
 	struct dquot		*i_dquot[MAXQUOTAS];
 #endif
-	struct offset_ctx	dir_offsets;	/* stable entry offsets */
 	struct inode		vfs_inode;
 };
 
-- 
cgit v1.2.3


From beb9868628445306958fd7b2da1cd369a4a381cc Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 29 Sep 2023 20:42:45 -0700
Subject: shmem,percpu_counter: add _limited_add(fbc, limit, amount)

Percpu counter's compare and add are separate functions: without locking
around them (which would defeat their purpose), it has been possible to
overflow the intended limit.  Imagine all the other CPUs fallocating tmpfs
huge pages to the limit, in between this CPU's compare and its add.

I have not seen reports of that happening; but tmpfs's recent addition of
dquot_alloc_block_nodirty() in between the compare and the add makes it
even more likely, and I'd be uncomfortable to leave it unfixed.

Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it.

I believe this implementation is correct, and slightly more efficient than
the combination of compare and add (taking the lock once rather than twice
when nearing full - the last 128MiB of a tmpfs volume on a machine with
128 CPUs and 4KiB pages); but it does beg for a better design - when
nearing full, there is no new batching, but the costly percpu counter sum
across CPUs still has to be done, while locked.

Follow __percpu_counter_sum()'s example, including cpu_dying_mask as well
as cpu_online_mask: but shouldn't __percpu_counter_compare() and
__percpu_counter_limited_add() then be adding a num_dying_cpus() to
num_online_cpus(), when they calculate the maximum which could be held
across CPUs?  But the times when it matters would be vanishingly rare.

Link: https://lkml.kernel.org/r/bb817848-2d19-bcc8-39ca-ea179af0f0b4@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Carlos Maiolino <cem@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu_counter.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index d01351b1526f..8cb7c071bd5c 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -57,6 +57,8 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
 			      s32 batch);
 s64 __percpu_counter_sum(struct percpu_counter *fbc);
 int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
+bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
+				  s64 amount, s32 batch);
 void percpu_counter_sync(struct percpu_counter *fbc);
 
 static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
@@ -69,6 +71,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 	percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
 }
 
+static inline bool
+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
+{
+	return __percpu_counter_limited_add(fbc, limit, amount,
+					    percpu_counter_batch);
+}
+
 /*
  * With percpu_counter_add_local() and percpu_counter_sub_local(), counts
  * are accumulated in local per cpu counter and not in fbc->count until
@@ -185,6 +194,20 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 	local_irq_restore(flags);
 }
 
+static inline bool
+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
+{
+	unsigned long flags;
+	s64 count;
+
+	local_irq_save(flags);
+	count = fbc->count + amount;
+	if (count <= limit)
+		fbc->count = count;
+	local_irq_restore(flags);
+	return count <= limit;
+}
+
 /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
 static inline void
 percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
-- 
cgit v1.2.3


From 1431996bf9088ee59f8017637ab9a7f89909ae63 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 11 Oct 2023 21:40:09 -0700
Subject: percpu_counter: extend _limited_add() to negative amounts

Though tmpfs does not need it, percpu_counter_limited_add() can be twice
as useful if it works sensibly with negative amounts (subs) - typically
decrements towards a limit of 0 or nearby: as suggested by Dave Chinner.

And in the course of that reworking, skip the percpu counter sum if it is
already obvious that the limit would be passed: as suggested by Tim Chen.

Extend the comment above __percpu_counter_limited_add(), defining the
behaviour with positive and negative amounts, allowing negative limits,
but not bothering about overflow beyond S64_MAX.

Link: https://lkml.kernel.org/r/8f86083b-c452-95d4-365b-f16a2e4ebcd4@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Carlos Maiolino <cem@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu_counter.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 8cb7c071bd5c..3a44dd1e33d2 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -198,14 +198,21 @@ static inline bool
 percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
 {
 	unsigned long flags;
+	bool good = false;
 	s64 count;
 
+	if (amount == 0)
+		return true;
+
 	local_irq_save(flags);
 	count = fbc->count + amount;
-	if (count <= limit)
+	if ((amount > 0 && count <= limit) ||
+	    (amount < 0 && count >= limit)) {
 		fbc->count = count;
+		good = true;
+	}
 	local_irq_restore(flags);
-	return count <= limit;
+	return good;
 }
 
 /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
-- 
cgit v1.2.3


From 069686255c16a75b6a796e42df47f5af27b496a4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 2 Oct 2023 16:29:48 +0200
Subject: mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap()

Let's convert it to consume a folio.

[akpm@linux-foundation.org: fix kerneldoc]
Link: https://lkml.kernel.org/r/20231002142949.235104-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d22f4d21a11c..b26fe858fd44 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -189,7 +189,7 @@ typedef int __bitwise rmap_t;
 /*
  * rmap interfaces called when adding or removing pte of page
  */
-void page_move_anon_rmap(struct page *, struct vm_area_struct *);
+void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
 void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
-- 
cgit v1.2.3


From c43cfa42541c04a3a94312e39ab81c41ba431277 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Tue, 3 Oct 2023 00:14:51 +0100
Subject: mm: make __access_remote_vm() static

Patch series "various improvements to the GUP interface", v2.

A series of fixes to simplify and improve the GUP interface with an eye to
providing groundwork to future improvements:-

* __access_remote_vm() and access_remote_vm() are functionally identical,
  so make the former static such that in future we can potentially change
  the external-facing implementation details of this function.

* Extend is_valid_gup_args() to cover the missing FOLL_TOUCH case, and
  simplify things by defining INTERNAL_GUP_FLAGS to check against.

* Adjust __get_user_pages_locked() to explicitly treat a failure to pin any
  pages as an error in all circumstances other than FOLL_NOWAIT being
  specified, bringing it in line with the nommu implementation of this
  function.

* (With many thanks to Arnd who suggested this in the first instance)
  Update get_user_page_vma_remote() to explicitly only return a page or an
  error, simplifying the interface and avoiding the questionable
  IS_ERR_OR_NULL() pattern.


This patch (of 4):

access_remote_vm() passes through parameters to __access_remote_vm()
directly, so remove the __access_remote_vm() function from mm.h and use
access_remote_vm() in the one caller that needs it (ptrace_access_vm()).

This allows future adjustments to the GUP-internal __access_remote_vm()
function while keeping the access_remote_vm() function stable.

Link: https://lkml.kernel.org/r/cover.1696288092.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/f7877c5039ce1c202a514a8aeeefc5cdd5e32d19.1696288092.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 52c40b3d0813..7b89f7bd420d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2415,8 +2415,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
 		void *buf, int len, unsigned int gup_flags);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, unsigned int gup_flags);
-extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
-			      void *buf, int len, unsigned int gup_flags);
 
 long get_user_pages_remote(struct mm_struct *mm,
 			   unsigned long start, unsigned long nr_pages,
-- 
cgit v1.2.3


From 6a1960b8a8773324d870fa32ba68ff3106523a95 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Tue, 3 Oct 2023 00:14:54 +0100
Subject: mm/gup: adapt get_user_page_vma_remote() to never return NULL

get_user_pages_remote() will never return 0 except in the case of
FOLL_NOWAIT being specified, which we explicitly disallow.

This simplifies error handling for the caller and avoids the awkwardness
of dealing with both errors and failing to pin.  Failing to pin here is an
error.

Link: https://lkml.kernel.org/r/00319ce292d27b3aae76a0eb220ce3f528187508.1696288092.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7b89f7bd420d..fa608cba041f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2425,6 +2425,9 @@ long pin_user_pages_remote(struct mm_struct *mm,
 			   unsigned int gup_flags, struct page **pages,
 			   int *locked);
 
+/*
+ * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
+ */
 static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
 						    unsigned long addr,
 						    int gup_flags,
@@ -2432,12 +2435,15 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
 {
 	struct page *page;
 	struct vm_area_struct *vma;
-	int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
+	int got;
+
+	if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
+		return ERR_PTR(-EINVAL);
+
+	got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
 
 	if (got < 0)
 		return ERR_PTR(got);
-	if (got == 0)
-		return NULL;
 
 	vma = vma_lookup(mm, addr);
 	if (WARN_ON_ONCE(!vma)) {
-- 
cgit v1.2.3


From 0b237047d5a72ffe06c0bdf2f4536f669dcd31c9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 4 Oct 2023 17:53:03 +0100
Subject: mm: add folio_end_read()

Provide a function for filesystems to call when they have finished reading
an entire folio.

Link: https://lkml.kernel.org/r/20231004165317.1061855-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 759b29d9a69a..bcc1ea44b4e8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1101,6 +1101,7 @@ static inline void wait_on_page_locked(struct page *page)
 	folio_wait_locked(page_folio(page));
 }
 
+void folio_end_read(struct folio *folio, bool success);
 void wait_on_page_writeback(struct page *page);
 void folio_wait_writeback(struct folio *folio);
 int folio_wait_writeback_killable(struct folio *folio);
-- 
cgit v1.2.3


From 0410cd844ed0af3db3cb510d877d62c66d26e5cc Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 4 Oct 2023 17:53:15 +0100
Subject: mm: add folio_xor_flags_has_waiters()

Optimise folio_end_read() by setting the uptodate bit at the same time we
clear the unlock bit.  This saves at least one memory barrier and one
write-after-write hazard.

Link: https://lkml.kernel.org/r/20231004165317.1061855-16-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5c02720c53a5..a88e64acebfe 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -692,6 +692,25 @@ TESTPAGEFLAG_FALSE(Ksm, ksm)
 
 u64 stable_page_flags(struct page *page);
 
+/**
+ * folio_xor_flags_has_waiters - Change some folio flags.
+ * @folio: The folio.
+ * @mask: Bits set in this word will be changed.
+ *
+ * This must only be used for flags which are changed with the folio
+ * lock held.  For example, it is unsafe to use for PG_dirty as that
+ * can be set without the folio lock held.  It can also only be used
+ * on flags which are in the range 0-6 as some of the implementations
+ * only affect those bits.
+ *
+ * Return: Whether there are tasks waiting on the folio.
+ */
+static inline bool folio_xor_flags_has_waiters(struct folio *folio,
+		unsigned long mask)
+{
+	return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0));
+}
+
 /**
  * folio_test_uptodate - Is this folio up to date?
  * @folio: The folio.
-- 
cgit v1.2.3


From 59838b2566f6d03099743675a2e0f425813078c6 Mon Sep 17 00:00:00 2001
From: Frank van der Linden <fvdl@google.com>
Date: Wed, 4 Oct 2023 15:32:48 +0000
Subject: mm, hugetlb: remove HUGETLB_CGROUP_MIN_ORDER

Originally, hugetlb_cgroup was the only hugetlb user of tail page
structure fields.  So, the code defined and checked against
HUGETLB_CGROUP_MIN_ORDER to make sure pages weren't too small to use.

However, by now, tail page #2 is used to store hugetlb hwpoison and
subpool information as well.  In other words, without that tail page
hugetlb doesn't work.

Acknowledge this fact by getting rid of HUGETLB_CGROUP_MIN_ORDER and
checks against it.  Instead, just check for the minimum viable page order
at hstate creation time.

Link: https://lkml.kernel.org/r/20231004153248.3842997-1-fvdl@google.com
Signed-off-by: Frank van der Linden <fvdl@google.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb_cgroup.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 3d82d91f49ac..e5d64b8b59c2 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -22,13 +22,6 @@ struct resv_map;
 struct file_region;
 
 #ifdef CONFIG_CGROUP_HUGETLB
-/*
- * Minimum page order trackable by hugetlb cgroup.
- * At least 3 pages are necessary for all the tracking information.
- * The second tail page contains all of the hugetlb-specific fields.
- */
-#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE)
-
 enum hugetlb_memory_event {
 	HUGETLB_MAX,
 	HUGETLB_NR_MEMORY_EVENTS,
@@ -68,8 +61,6 @@ static inline struct hugetlb_cgroup *
 __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd)
 {
 	VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
-	if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
-		return NULL;
 	if (rsvd)
 		return folio->_hugetlb_cgroup_rsvd;
 	else
@@ -91,8 +82,6 @@ static inline void __set_hugetlb_cgroup(struct folio *folio,
 				       struct hugetlb_cgroup *h_cg, bool rsvd)
 {
 	VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
-	if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
-		return;
 	if (rsvd)
 		folio->_hugetlb_cgroup_rsvd = h_cg;
 	else
-- 
cgit v1.2.3


From 4b569387c0d566db288e7c3e1b484b43df797bdb Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Fri, 6 Oct 2023 11:46:26 -0700
Subject: memcontrol: add helpers for hugetlb memcg accounting

Patch series "hugetlb memcg accounting", v4.

Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory.  This has been observed in our production system.

For instance, here is one of our usecases: suppose there are two 32G
containers.  The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it.  The rest is anon, cache, slab, etc.  We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness.  But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etcetera fair.

What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max.  Similar procedure is done to other memory limits
(memory.low for e.g).  However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.

This patch series rectifies this issue by charging the memcg when the
hugetlb folio is allocated, and uncharging when the folio is freed.  In
addition, a new selftest is added to demonstrate and verify this new
behavior.


This patch (of 4):

This patch exposes charge committing and cancelling as parts of the memory
controller interface.  These functionalities are useful when the
try_charge() and commit_charge() stages have to be separated by other
actions in between (which can fail).  One such example is the new hugetlb
accounting behavior in the following patch.

The patch also adds a helper function to obtain a reference to the
current task's memcg.

Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c6029aeaa268..bea3d997b076 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -652,6 +652,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
 		page_counter_read(&memcg->memory);
 }
 
+void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
+
 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
 
 /**
@@ -703,6 +705,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 	__mem_cgroup_uncharge_list(page_list);
 }
 
+void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
+
 void mem_cgroup_migrate(struct folio *old, struct folio *new);
 
 /**
@@ -759,6 +763,8 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
+struct mem_cgroup *get_mem_cgroup_from_current(void);
+
 struct lruvec *folio_lruvec_lock(struct folio *folio);
 struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
@@ -1239,6 +1245,11 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
 	return false;
 }
 
+static inline void mem_cgroup_commit_charge(struct folio *folio,
+		struct mem_cgroup *memcg)
+{
+}
+
 static inline int mem_cgroup_charge(struct folio *folio,
 		struct mm_struct *mm, gfp_t gfp)
 {
@@ -1263,6 +1274,11 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
+static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
+		unsigned int nr_pages)
+{
+}
+
 static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
 }
@@ -1300,6 +1316,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+	return NULL;
+}
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
 {
-- 
cgit v1.2.3


From 85ce2c517ade0d51b7ad95f2e88be9bbe294379a Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Fri, 6 Oct 2023 11:46:27 -0700
Subject: memcontrol: only transfer the memcg data for migration

For most migration use cases, only transfer the memcg data from the old
folio to the new folio, and clear the old folio's memcg data.  No charging
and uncharging will be done.

This shaves off some work on the migration path, and avoids the temporary
double charging of a folio during its migration.

The only exception is replace_page_cache_folio(), which will use the old
mem_cgroup_migrate() (now renamed to mem_cgroup_replace_folio).  In that
context, the isolation of the old page isn't quite as thorough as with
migration, so we cannot use our new implementation directly.

This patch is the result of the following discussion on the new hugetlb
memcg accounting behavior:

https://lore.kernel.org/lkml/20231003171329.GB314430@monkey/

Link: https://lkml.kernel.org/r/20231006184629.155543-3-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bea3d997b076..a3fce570122e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -707,6 +707,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 
 void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
 
+void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
+
 void mem_cgroup_migrate(struct folio *old, struct folio *new);
 
 /**
@@ -1279,6 +1281,11 @@ static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
 {
 }
 
+static inline void mem_cgroup_replace_folio(struct folio *old,
+		struct folio *new)
+{
+}
+
 static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
 }
-- 
cgit v1.2.3


From 8cba9576df601c384abd334a503c3f6e1e29eefb Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Fri, 6 Oct 2023 11:46:28 -0700
Subject: hugetlb: memcg: account hugetlb-backed memory in memory controller

Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory.  This has been observed in our production system.

For instance, here is one of our usecases: suppose there are two 32G
containers.  The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it.  The rest is anon, cache, slab, etc.  We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness.  But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etc.  fair.

What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max.  Similar procedure is done to other memory limits
(memory.low for e.g).  However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.

This patch rectifies this issue by charging the memcg when the hugetlb
folio is utilized, and uncharging when the folio is freed (analogous to
the hugetlb controller).  Note that we do not charge when the folio is
allocated to the hugetlb pool, because at this point it is not owned by
any memcg.

Some caveats to consider:
  * This feature is only available on cgroup v2.
  * There is no hugetlb pool management involved in the memory
    controller. As stated above, hugetlb folios are only charged towards
    the memory controller when it is used. Host overcommit management
    has to consider it when configuring hard limits.
  * Failure to charge towards the memcg results in SIGBUS. This could
    happen even if the hugetlb pool still has pages (but the cgroup
    limit is hit and reclaim attempt fails).
  * When this feature is enabled, hugetlb pages contribute to memory
    reclaim protection. low, min limits tuning must take into account
    hugetlb memory.
  * Hugetlb pages utilized while this option is not selected will not
    be tracked by the memory controller (even if cgroup v2 is remounted
    later on).

Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cgroup-defs.h | 5 +++++
 include/linux/memcontrol.h  | 9 +++++++++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f1b3151ac30b..8641f4320c98 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -115,6 +115,11 @@ enum {
 	 * Enable recursive subtree protection
 	 */
 	CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),
+
+	/*
+	 * Enable hugetlb accounting for the memory controller.
+	 */
+	 CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
 };
 
 /* cftype->flags */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a3fce570122e..6674c12725d5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -678,6 +678,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
 	return __mem_cgroup_charge(folio, mm, gfp);
 }
 
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+		long nr_pages);
+
 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -1258,6 +1261,12 @@ static inline int mem_cgroup_charge(struct folio *folio,
 	return 0;
 }
 
+static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
+		gfp_t gfp, long nr_pages)
+{
+	return 0;
+}
+
 static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
-- 
cgit v1.2.3


From f04eba134e598d4a5af2eeecff0562df5042cbfc Mon Sep 17 00:00:00 2001
From: Lucy Mielke <lucymielke@icloud.com>
Date: Fri, 6 Oct 2023 22:30:51 +0200
Subject: mm: add printf attribute to shrinker_debugfs_name_alloc

This fixes a compiler warning when compiling an allyesconfig with W=1:

mm/internal.h:1235:9: error: function might be a candidate for `gnu_printf'
format attribute [-Werror=suggest-attribute=format]

[akpm@linux-foundation.org: fix shrinker_alloc() as welll per Qi Zheng]
  Link: https://lkml.kernel.org/r/822387b7-4895-4e64-5806-0f56b5d6c447@bytedance.com
Link: https://lkml.kernel.org/r/ZSBue-3kM6gI6jCr@mainframe
Fixes: c42d50aefd17 ("mm: shrinker: add infrastructure for dynamically allocating shrinker")
Signed-off-by: Lucy Mielke <lucymielke@icloud.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index e4f93120e0ab..1a00be90d93a 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -131,6 +131,7 @@ struct shrinker {
  */
 #define SHRINKER_NONSLAB	BIT(4)
 
+__printf(2, 3)
 struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
 void shrinker_register(struct shrinker *shrinker);
 void shrinker_free(struct shrinker *shrinker);
-- 
cgit v1.2.3


From 37acade0ce8938f00d6979bd02b8043b5b7089ae Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 10 Oct 2023 04:58:29 +0100
Subject: sched: remove wait bookmarks

There are no users of wait bookmarks left, so simplify the wait
code by removing them.

Link: https://lkml.kernel.org/r/20231010035829.544242-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Benjamin Segall <bsegall@google.com>
Cc: Bin Lai <sclaibin@gmail.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/wait.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5ec7739400f4..3473b663176f 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -19,10 +19,9 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 /* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
-#define WQ_FLAG_BOOKMARK	0x04
-#define WQ_FLAG_CUSTOM		0x08
-#define WQ_FLAG_DONE		0x10
-#define WQ_FLAG_PRIORITY	0x20
+#define WQ_FLAG_CUSTOM		0x04
+#define WQ_FLAG_DONE		0x08
+#define WQ_FLAG_PRIORITY	0x10
 
 /*
  * A single wait-queue entry structure:
@@ -212,8 +211,6 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
 int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
-		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
-- 
cgit v1.2.3


From 3657fdc2451abf135c2d20949acf57d78cc50338 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 11 Oct 2023 18:04:27 +0100
Subject: mm: move vma_policy() and anon_vma_name() decls to mm_types.h

Patch series "Abstract vma_merge() and split_vma()", v4.

The vma_merge() interface is very confusing and its implementation has led
to numerous bugs as a result of that confusion.

In addition there is duplication both in invocation of vma_merge(), but
also in the common mprotect()-style pattern of attempting a merge, then if
this fails, splitting the portion of a VMA about to have its attributes
changed.

This pattern has been copy/pasted around the kernel in each instance where
such an operation has been required, each very slightly modified from the
last to make it even harder to decipher what is going on.

Simplify the whole thing by dividing the actual uses of vma_merge() and
split_vma() into specific and abstracted functions and de-duplicate the
vma_merge()/split_vma() pattern altogether.

Doing so also opens the door to changing how vma_merge() is implemented -
by knowing precisely what cases a caller is invoking rather than having a
central interface where anything might happen we can untangle the brittle
and confusing vma_merge() implementation into something more workable.

For mprotect()-like cases we introduce vma_modify() which performs the
vma_merge()/split_vma() pattern, returning a pointer to either the merged
or split VMA or an ERR_PTR(err) if the splits fail.

We provide a number of inline helper functions to make things even clearer:-

* vma_modify_flags()      - Prepare to modify the VMA's flags.
* vma_modify_flags_name() - Prepare to modify the VMA's flags/anon_vma_name
* vma_modify_policy()     - Prepare to modify the VMA's mempolicy.
* vma_modify_flags_uffd() - Prepare to modify the VMA's flags/uffd context.

For cases where a new VMA is attempted to be merged with adjacent VMAs we
add:-

* vma_merge_new_vma() - Prepare to merge a new VMA.
* vma_merge_extend()  - Prepare to extend the end of a new VMA.


This patch (of 5):

The vma_policy() define is a helper specifically for a VMA field so it
makes sense to host it in the memory management types header.

The anon_vma_name(), anon_vma_name_alloc() and anon_vma_name_free()
functions are a little out of place in mm_inline.h as they define external
functions, and so it makes sense to locate them in mm_types.h.

The purpose of these relocations is to make it possible to abstract static
inline wrappers which invoke both of these helpers.

Link: https://lkml.kernel.org/r/cover.1697043508.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/24bfc6c9e382fffbcb0ea8d424392c27d56cc8ca.1697043508.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h |  4 ----
 include/linux/mm_inline.h | 20 +-------------------
 include/linux/mm_types.h  | 27 +++++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 6c2754d7bfed..1d7f4ec2614c 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -89,8 +89,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
 	return pol;
 }
 
-#define vma_policy(vma) ((vma)->vm_policy)
-
 static inline void mpol_get(struct mempolicy *pol)
 {
 	if (pol)
@@ -222,8 +220,6 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 	return NULL;
 }
 
-#define vma_policy(vma) NULL
-
 static inline int
 vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8148b30a9df1..9ae7def16cb2 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -4,6 +4,7 @@
 
 #include <linux/atomic.h>
 #include <linux/huge_mm.h>
+#include <linux/mm_types.h>
 #include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/userfaultfd_k.h>
@@ -352,15 +353,6 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
 }
 
 #ifdef CONFIG_ANON_VMA_NAME
-/*
- * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
- * either keep holding the lock while using the returned pointer or it should
- * raise anon_vma_name refcount before releasing the lock.
- */
-extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
-extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
-extern void anon_vma_name_free(struct kref *kref);
-
 /* mmap_lock should be read-locked */
 static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
 {
@@ -415,16 +407,6 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
 }
 
 #else /* CONFIG_ANON_VMA_NAME */
-static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
-{
-	return NULL;
-}
-
-static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
-{
-	return NULL;
-}
-
 static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
 static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
 static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36c5b43999e6..21eb56145f57 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -546,6 +546,27 @@ struct anon_vma_name {
 	char name[];
 };
 
+#ifdef CONFIG_ANON_VMA_NAME
+/*
+ * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
+ * either keep holding the lock while using the returned pointer or it should
+ * raise anon_vma_name refcount before releasing the lock.
+ */
+struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
+struct anon_vma_name *anon_vma_name_alloc(const char *name);
+void anon_vma_name_free(struct kref *kref);
+#else /* CONFIG_ANON_VMA_NAME */
+static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
+static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
+{
+	return NULL;
+}
+#endif
+
 struct vma_lock {
 	struct rw_semaphore lock;
 };
@@ -662,6 +683,12 @@ struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 } __randomize_layout;
 
+#ifdef CONFIG_NUMA
+#define vma_policy(vma) ((vma)->vm_policy)
+#else
+#define vma_policy(vma) NULL
+#endif
+
 #ifdef CONFIG_SCHED_MM_CID
 struct mm_cid {
 	u64 time;
-- 
cgit v1.2.3


From 94d7d923395129b9248777e575c877e40007f9dc Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 11 Oct 2023 18:04:28 +0100
Subject: mm: abstract the vma_merge()/split_vma() pattern for mprotect() et
 al.

mprotect() and other functions which change VMA parameters over a range
each employ a pattern of:-

1. Attempt to merge the range with adjacent VMAs.
2. If this fails, and the range spans a subset of the VMA, split it
   accordingly.

This is open-coded and duplicated in each case. Also in each case most of
the parameters passed to vma_merge() remain the same.

Create a new function, vma_modify(), which abstracts this operation,
accepting only those parameters which can be changed.

To avoid the mess of invoking each function call with unnecessary
parameters, create inline wrapper functions for each of the modify
operations, parameterised only by what is required to perform the action.

We can also significantly simplify the logic - by returning the VMA if we
split (or merged VMA if we do not) we no longer need specific handling for
merge/split cases in any of the call sites.

Note that the userfaultfd_release() case works even though it does not
split VMAs - since start is set to vma->vm_start and end is set to
vma->vm_end, the split logic does not trigger.

In addition, since we calculate pgoff to be equal to vma->vm_pgoff + (start
- vma->vm_start) >> PAGE_SHIFT, and start - vma->vm_start will be 0 in this
instance, this invocation will remain unchanged.

We eliminate a VM_WARN_ON() in mprotect_fixup() as this simply asserts that
vma_merge() correctly ensures that flags remain the same, something that is
already checked in is_mergeable_vma() and elsewhere, and in any case is not
specific to mprotect().

Link: https://lkml.kernel.org/r/0dfa9368f37199a423674bf0ee312e8ea0619044.1697043508.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa608cba041f..895feb814620 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3251,6 +3251,66 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff,
 	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
+struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
+				  struct vm_area_struct *prev,
+				  struct vm_area_struct *vma,
+				  unsigned long start, unsigned long end,
+				  unsigned long vm_flags,
+				  struct mempolicy *policy,
+				  struct vm_userfaultfd_ctx uffd_ctx,
+				  struct anon_vma_name *anon_name);
+
+/* We are about to modify the VMA's flags. */
+static inline struct vm_area_struct
+*vma_modify_flags(struct vma_iterator *vmi,
+		  struct vm_area_struct *prev,
+		  struct vm_area_struct *vma,
+		  unsigned long start, unsigned long end,
+		  unsigned long new_flags)
+{
+	return vma_modify(vmi, prev, vma, start, end, new_flags,
+			  vma_policy(vma), vma->vm_userfaultfd_ctx,
+			  anon_vma_name(vma));
+}
+
+/* We are about to modify the VMA's flags and/or anon_name. */
+static inline struct vm_area_struct
+*vma_modify_flags_name(struct vma_iterator *vmi,
+		       struct vm_area_struct *prev,
+		       struct vm_area_struct *vma,
+		       unsigned long start,
+		       unsigned long end,
+		       unsigned long new_flags,
+		       struct anon_vma_name *new_name)
+{
+	return vma_modify(vmi, prev, vma, start, end, new_flags,
+			  vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
+}
+
+/* We are about to modify the VMA's memory policy. */
+static inline struct vm_area_struct
+*vma_modify_policy(struct vma_iterator *vmi,
+		   struct vm_area_struct *prev,
+		   struct vm_area_struct *vma,
+		   unsigned long start, unsigned long end,
+		   struct mempolicy *new_pol)
+{
+	return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
+			  new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+}
+
+/* We are about to modify the VMA's flags and/or uffd context. */
+static inline struct vm_area_struct
+*vma_modify_flags_uffd(struct vma_iterator *vmi,
+		       struct vm_area_struct *prev,
+		       struct vm_area_struct *vma,
+		       unsigned long start, unsigned long end,
+		       unsigned long new_flags,
+		       struct vm_userfaultfd_ctx new_ctx)
+{
+	return vma_modify(vmi, prev, vma, start, end, new_flags,
+			  vma_policy(vma), new_ctx, anon_vma_name(vma));
+}
 
 static inline int check_data_rlimit(unsigned long rlim,
 				    unsigned long new,
-- 
cgit v1.2.3


From adb20b0c785e9e8d5e4d4a07b9c3340d7de0e5dc Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 11 Oct 2023 18:04:29 +0100
Subject: mm: make vma_merge() and split_vma() internal

Now the common pattern of - attempting a merge via vma_merge() and should
this fail splitting VMAs via split_vma() - has been abstracted, the former
can be placed into mm/internal.h and the latter made static.

In addition, the split_vma() nommu variant also need not be exported.

Link: https://lkml.kernel.org/r/405f2be10e20c4e9fbcc9fe6b2dfea105f6642e0.1697043508.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 895feb814620..1bddb151cd5c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3235,16 +3235,7 @@ extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		      struct vm_area_struct *next);
 extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		       unsigned long start, unsigned long end, pgoff_t pgoff);
-extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi,
-	struct mm_struct *, struct vm_area_struct *prev, unsigned long addr,
-	unsigned long end, unsigned long vm_flags, struct anon_vma *,
-	struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx,
-	struct anon_vma_name *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
-extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
-		       unsigned long addr, int new_below);
-extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
-			 unsigned long addr, int new_below);
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-- 
cgit v1.2.3


From e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Thu, 12 Oct 2023 18:04:28 +0100
Subject: mm: drop the assumption that VM_SHARED always implies writable

Patch series "permit write-sealed memfd read-only shared mappings", v4.

The man page for fcntl() describing memfd file seals states the following
about F_SEAL_WRITE:-

    Furthermore, trying to create new shared, writable memory-mappings via
    mmap(2) will also fail with EPERM.

With emphasis on 'writable'.  In turns out in fact that currently the
kernel simply disallows all new shared memory mappings for a memfd with
F_SEAL_WRITE applied, rendering this documentation inaccurate.

This matters because users are therefore unable to obtain a shared mapping
to a memfd after write sealing altogether, which limits their usefulness.
This was reported in the discussion thread [1] originating from a bug
report [2].

This is a product of both using the struct address_space->i_mmap_writable
atomic counter to determine whether writing may be permitted, and the
kernel adjusting this counter when any VM_SHARED mapping is performed and
more generally implicitly assuming VM_SHARED implies writable.

It seems sensible that we should only update this mapping if VM_MAYWRITE
is specified, i.e.  whether it is possible that this mapping could at any
point be written to.

If we do so then all we need to do to permit write seals to function as
documented is to clear VM_MAYWRITE when mapping read-only.  It turns out
this functionality already exists for F_SEAL_FUTURE_WRITE - we can
therefore simply adapt this logic to do the same for F_SEAL_WRITE.

We then hit a chicken and egg situation in mmap_region() where the check
for VM_MAYWRITE occurs before we are able to clear this flag.  To work
around this, perform this check after we invoke call_mmap(), with careful
consideration of error paths.

Thanks to Andy Lutomirski for the suggestion!

[1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/
[2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238


This patch (of 3):

There is a general assumption that VMAs with the VM_SHARED flag set are
writable.  If the VM_MAYWRITE flag is not set, then this is simply not the
case.

Update those checks which affect the struct address_space->i_mmap_writable
field to explicitly test for this by introducing
[vma_]is_shared_maywrite() helper functions.

This remains entirely conservative, as the lack of VM_MAYWRITE guarantees
that the VMA cannot be written to.

Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h |  4 ++--
 include/linux/mm.h | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd539c9fef8e..5265186da0e2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -454,7 +454,7 @@ extern const struct address_space_operations empty_aops;
  *   It is also used to block modification of page cache contents through
  *   memory mappings.
  * @gfp_mask: Memory allocation flags to use for allocating pages.
- * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
  * @nr_thps: Number of THPs in the pagecache (non-shmem only).
  * @i_mmap: Tree of private and shared mappings.
  * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
@@ -557,7 +557,7 @@ static inline int mapping_mapped(struct address_space *mapping)
 
 /*
  * Might pages of this file have been modified in userspace?
- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
+ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
  * marks vma as VM_SHARED if it is shared, and the file was opened for
  * writing i.e. vma may be mprotected writable even if now readonly.
  *
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1bddb151cd5c..c3b5749ede9d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -937,6 +937,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_ACCESS_FLAGS;
 }
 
+static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+{
+	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+		(VM_SHARED | VM_MAYWRITE);
+}
+
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+	return is_shared_maywrite(vma->vm_flags);
+}
+
 static inline
 struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
 {
-- 
cgit v1.2.3


From 28464bbb2ddc199433383994bcb9600c8034afa1 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Thu, 12 Oct 2023 18:04:29 +0100
Subject: mm: update memfd seal write check to include F_SEAL_WRITE

The seal_check_future_write() function is called by shmem_mmap() or
hugetlbfs_file_mmap() to disallow any future writable mappings of an memfd
sealed this way.

The F_SEAL_WRITE flag is not checked here, as that is handled via the
mapping->i_mmap_writable mechanism and so any attempt at a mapping would
fail before this could be run.

However we intend to change this, meaning this check can be performed for
F_SEAL_WRITE mappings also.

The logic here is equally applicable to both flags, so update this
function to accommodate both and rename it accordingly.

Link: https://lkml.kernel.org/r/913628168ce6cce77df7d13a63970bae06a526e0.1697116581.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c3b5749ede9d..86e040e886e4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4076,25 +4076,26 @@ static inline void mem_dump_obj(void *object) {}
 #endif
 
 /**
- * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
+ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
+ *                    handle them.
  * @seals: the seals to check
  * @vma: the vma to operate on
  *
- * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
- * the vma flags.  Return 0 if check pass, or <0 for errors.
+ * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
+ * check/handling on the vma flags.  Return 0 if check pass, or <0 for errors.
  */
-static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
+static inline int seal_check_write(int seals, struct vm_area_struct *vma)
 {
-	if (seals & F_SEAL_FUTURE_WRITE) {
+	if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
 		/*
 		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
-		 * "future write" seal active.
+		 * write seals are active.
 		 */
 		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
 			return -EPERM;
 
 		/*
-		 * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+		 * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
 		 * MAP_SHARED and read-only, take care to not allow mprotect to
 		 * revert protections on such mappings. Do this only for shared
 		 * mappings. For private mappings, don't need to mask
-- 
cgit v1.2.3


From 3decb8564eff88a2533f83b01cec2cf9259c3eaf Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 16 Oct 2023 21:10:49 +0100
Subject: buffer: make folio_create_empty_buffers() return a buffer_head

Patch series "Finish the create_empty_buffers() transition", v2.

Pankaj recently added folio_create_empty_buffers() as the folio equivalent
to create_empty_buffers().  This patch set finishes the conversion by
first converting all remaining filesystems to call
folio_create_empty_buffers(), then renaming it back to
create_empty_buffers().  I took the opportunity to make a few
simplifications like making folio_create_empty_buffers() return the head
buffer and extracting get_nth_bh() from nilfs2.

A few of the patches in this series aren't directly related to
create_empty_buffers(), but I saw them while I was working on this and
thought they'd be easy enough to add to this series.  Compile-tested only,
other than ext4.


This patch (of 26):

Almost all callers want to know the first BH that was allocated for this
folio.  We already have that handy, so return it.

Link: https://lkml.kernel.org/r/20231016201114.1928083-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231016201114.1928083-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3dc4720e4773..1001244a8941 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -203,8 +203,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
 void create_empty_buffers(struct page *, unsigned long,
 			unsigned long b_state);
-void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
-				unsigned long b_state);
+struct buffer_head *folio_create_empty_buffers(struct folio *folio,
+		unsigned long blocksize, unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_async_write(struct buffer_head *bh, int uptodate);
-- 
cgit v1.2.3


From 0217fbb0271a7c78e16629cf45375916ec2eb35f Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 16 Oct 2023 21:10:52 +0100
Subject: buffer: add get_nth_bh()

Extract this useful helper from nilfs_page_get_nth_block()

Link: https://lkml.kernel.org/r/20231016201114.1928083-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 1001244a8941..3d85a0cf0ca5 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -457,6 +457,28 @@ __bread(struct block_device *bdev, sector_t block, unsigned size)
 	return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
 }
 
+/**
+ * get_nth_bh - Get a reference on the n'th buffer after this one.
+ * @bh: The buffer to start counting from.
+ * @count: How many buffers to skip.
+ *
+ * This is primarily useful for finding the nth buffer in a folio; in
+ * that case you pass the head buffer and the byte offset in the folio
+ * divided by the block size.  It can be used for other purposes, but
+ * it will wrap at the end of the folio rather than returning NULL or
+ * proceeding to the next folio for you.
+ *
+ * Return: The requested buffer with an elevated refcount.
+ */
+static inline __must_check
+struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count)
+{
+	while (count--)
+		bh = bh->b_this_page;
+	get_bh(bh);
+	return bh;
+}
+
 bool block_dirty_folio(struct address_space *mapping, struct folio *folio);
 
 #ifdef CONFIG_BUFFER_HEAD
-- 
cgit v1.2.3


From 0a88810d9b76e6ecfd234f5728e27344a39e3ff8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 16 Oct 2023 21:11:14 +0100
Subject: buffer: remove folio_create_empty_buffers()

With all users converted, remove the old create_empty_buffers() and rename
folio_create_empty_buffers() to create_empty_buffers().

Link: https://lkml.kernel.org/r/20231016201114.1928083-28-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3d85a0cf0ca5..5f23ee599889 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -201,9 +201,7 @@ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
 					gfp_t gfp);
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
-void create_empty_buffers(struct page *, unsigned long,
-			unsigned long b_state);
-struct buffer_head *folio_create_empty_buffers(struct folio *folio,
+struct buffer_head *create_empty_buffers(struct folio *folio,
 		unsigned long blocksize, unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
-- 
cgit v1.2.3


From ca71fe1ad9221a89c6a25f49159c600d9e598ae1 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:54 +0800
Subject: mm, pcp: avoid to drain PCP when process exit

Patch series "mm: PCP high auto-tuning", v3.

The page allocation performance requirements of different workloads are
often different.  So, we need to tune the PCP (Per-CPU Pageset) high on
each CPU automatically to optimize the page allocation performance.

The list of patches in series is as follows,

[1/9] mm, pcp: avoid to drain PCP when process exit
[2/9] cacheinfo: calculate per-CPU data cache size
[3/9] mm, pcp: reduce lock contention for draining high-order pages
[4/9] mm: restrict the pcp batch scale factor to avoid too long latency
[5/9] mm, page_alloc: scale the number of pages that are batch allocated
[6/9] mm: add framework for PCP high auto-tuning
[7/9] mm: tune PCP high automatically
[8/9] mm, pcp: decrease PCP high if free pages < high watermark
[9/9] mm, pcp: reduce detecting time of consecutive high order page freeing

Patch [1/9], [2/9], [3/9] optimize the PCP draining for consecutive
high-order pages freeing.

Patch [4/9], [5/9] optimize batch freeing and allocating.

Patch [6/9], [7/9], [8/9] implement and optimize a PCP high
auto-tuning method.

Patch [9/9] optimize the PCP draining for consecutive high order page
freeing based on PCP high auto-tuning.

The test results for patches with performance impact are as follows,

kbuild
======

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.

	build time   lock contend%	free_high	alloc_zone
	----------	----------	---------	----------
base	     100.0	      14.0          100.0            100.0
patch1	      99.5	      12.8	     19.5	      95.6
patch3	      99.4	      12.6	      7.1	      95.6
patch5	      98.6	      11.0	      8.1	      97.1
patch7	      95.1	       0.5	      2.8	      15.6
patch9	      95.0	       1.0	      8.8	      20.0

The PCP draining optimization (patch [1/9], [3/9]) and PCP batch
allocation optimization (patch [5/9]) reduces zone lock contention a
little.  The PCP high auto-tuning (patch [7/9], [9/9]) reduces build time
visibly.  Where the tuning target: the number of pages allocated from zone
reduces greatly.  So, the zone contention cycles% reduces greatly.

With PCP tuning patches (patch [7/9], [9/9]), the average used memory
during test increases up to 18.4% because more pages are cached in PCP.
But at the end of the test, the number of the used memory decreases to the
same level as that of the base patch.  That is, the pages cached in PCP
will be released to zone after not being used actively.

netperf SCTP_STREAM_MANY
========================

On a 2-socket Intel server with 128 logical CPU, we tested
SCTP_STREAM_MANY test case of netperf test suite with 64-pair processes.

	     score   lock contend%	free_high	alloc_zone  cache miss rate%
	     -----	----------	---------	----------  ----------------
base	     100.0	       2.1          100.0            100.0	         1.3
patch1	      99.4	       2.1	     99.4	      99.4		 1.3
patch3	     106.4	       1.3	     13.3	     106.3		 1.3
patch5	     106.0	       1.2	     13.2	     105.9		 1.3
patch7	     103.4	       1.9	      6.7	      90.3		 7.6
patch9	     108.6	       1.3	     13.7	     108.6		 1.3

The PCP draining optimization (patch [1/9]+[3/9]) improves performance.
The PCP high auto-tuning (patch [7/9]) reduces performance a little
because PCP draining cannot be triggered in time sometimes.  So, the cache
miss rate% increases.  The further PCP draining optimization (patch [9/9])
based on PCP tuning restore the performance.

lmbench3 UNIX (AF_UNIX)
=======================

On a 2-socket Intel server with 128 logical CPU, we tested UNIX
(AF_UNIX socket) test case of lmbench3 test suite with 16-pair
processes.

	     score   lock contend%	free_high	alloc_zone  cache miss rate%
	     -----	----------	---------	----------  ----------------
base	     100.0	      51.4          100.0            100.0	         0.2
patch1	     116.8	      46.1           69.5	     104.3	         0.2
patch3	     199.1	      21.3            7.0	     104.9	         0.2
patch5	     200.0	      20.8            7.1	     106.9	         0.3
patch7	     191.6	      19.9            6.8	     103.8	         2.8
patch9	     193.4	      21.7            7.0	     104.7	         2.1

The PCP draining optimization (patch [1/9], [3/9]) improves performance
much.  The PCP tuning (patch [7/9]) reduces performance a little because
PCP draining cannot be triggered in time sometimes.  The further PCP
draining optimization (patch [9/9]) based on PCP tuning restores the
performance partly.

The patchset adds several fields in struct per_cpu_pages.  The struct
layout before/after the patchset is as follows,

base
====

struct per_cpu_pages {
	spinlock_t                 lock;                 /*     0     4 */
	int                        count;                /*     4     4 */
	int                        high;                 /*     8     4 */
	int                        batch;                /*    12     4 */
	short int                  free_factor;          /*    16     2 */
	short int                  expire;               /*    18     2 */

	/* XXX 4 bytes hole, try to pack */

	struct list_head           lists[13];            /*    24   208 */

	/* size: 256, cachelines: 4, members: 7 */
	/* sum members: 228, holes: 1, sum holes: 4 */
	/* padding: 24 */
} __attribute__((__aligned__(64)));

patched
=======

struct per_cpu_pages {
	spinlock_t                 lock;                 /*     0     4 */
	int                        count;                /*     4     4 */
	int                        high;                 /*     8     4 */
	int                        high_min;             /*    12     4 */
	int                        high_max;             /*    16     4 */
	int                        batch;                /*    20     4 */
	u8                         flags;                /*    24     1 */
	u8                         alloc_factor;         /*    25     1 */
	u8                         expire;               /*    26     1 */

	/* XXX 1 byte hole, try to pack */

	short int                  free_count;           /*    28     2 */

	/* XXX 2 bytes hole, try to pack */

	struct list_head           lists[13];            /*    32   208 */

	/* size: 256, cachelines: 4, members: 11 */
	/* sum members: 237, holes: 2, sum holes: 3 */
	/* padding: 16 */
} __attribute__((__aligned__(64)));

The size of the struct doesn't changed with the patchset.


This patch (of 9):

In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages
on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when
PCP is mostly used for high-order pages freeing to improve the cache-hot
pages reusing between page allocation and freeing CPUs.

But, the PCP draining mechanism may be triggered unexpectedly when process
exits.  With some customized trace point, it was found that PCP draining
(free_high == true) was triggered with the order-1 page freeing with the
following call stack,

 => free_unref_page_commit
 => free_unref_page
 => __mmdrop
 => exit_mm
 => do_exit
 => do_group_exit
 => __x64_sys_exit_group
 => do_syscall_64

Checking the source code, this is the page table PGD freeing
(mm_free_pgd()).  It's a order-1 page freeing if
CONFIG_PAGE_TABLE_ISOLATION=y.  Which is a common configuration for
security.

Just before that, page freeing with the following call stack was found,

 => free_unref_page_commit
 => free_unref_page_list
 => release_pages
 => tlb_batch_pages_flush
 => tlb_finish_mmu
 => exit_mmap
 => __mmput
 => exit_mm
 => do_exit
 => do_group_exit
 => __x64_sys_exit_group
 => do_syscall_64

So, when a process exits,

- a large number of user pages of the process will be freed without
  page allocation, it's highly possible that pcp->free_factor becomes >
  0.  In fact, this is expected behavior to improve process exit
  performance.

- after freeing all user pages, the PGD will be freed, which is a
  order-1 page freeing, PCP will be drained.

All in all, when a process exits, it's high possible that the PCP will be
drained.  This is an unexpected behavior.

To avoid this, in the patch, the PCP draining will only be triggered for 2
consecutive high-order page freeing.

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.  With the patch, the
cycles% of the spinlock contention (mostly for zone lock) decreases from
14.0% to 12.8% (with PCP size == 367).  The number of PCP draining for
high order pages freeing (free_high) decreases 80.5%.

This helps network workload too for reduced zone lock contention.  On a
2-socket Intel server with 128 logical CPU, with the patch, the network
bandwidth of the UNIX (AF_UNIX) test case of lmbench test suite with
16-pair processes increase 16.8%.  The cycles% of the spinlock contention
(mostly for zone lock) decreases from 51.4% to 46.1%.  The number of PCP
draining for high order pages freeing (free_high) decreases 30.5%.  The
cache miss rate keeps 0.2%.

Link: https://lkml.kernel.org/r/20231016053002.756205-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20231016053002.756205-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 486587fcd27f..de313f1c15f9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -674,12 +674,22 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/*
+ * Flags used in pcp->flags field.
+ *
+ * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the
+ * previous page freeing.  To avoid to drain PCP for an accident
+ * high-order page freeing.
+ */
+#define	PCPF_PREV_FREE_HIGH_ORDER	BIT(0)
+
 struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
-	short free_factor;	/* batch scaling factor during free */
+	u8 flags;		/* protected by pcp->lock */
+	u8 free_factor;		/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
 	short expire;		/* When 0, remote pagesets are drained */
 #endif
-- 
cgit v1.2.3


From 94a3bfe4073cd88b05f7fb201ea7bf9dfa2cf5d5 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:55 +0800
Subject: cacheinfo: calculate size of per-CPU data cache slice

This can be used to estimate the size of the data cache slice that can be
used by one CPU under ideal circumstances.  Both DATA caches and UNIFIED
caches are used in calculation.  So, the users need to consider the impact
of the code cache usage.

Because the cache inclusive/non-inclusive information isn't available now,
we just use the size of the per-CPU slice of LLC to make the result more
predictable across architectures.  This may be improved when more cache
information is available in the future.

A brute-force algorithm to iterate all online CPUs is used to avoid to
allocate an extra cpumask, especially in offline callback.

Link: https://lkml.kernel.org/r/20231016053002.756205-3-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cacheinfo.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index a5cfd44fab45..d504eb4b49ab 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -73,6 +73,7 @@ struct cacheinfo {
 
 struct cpu_cacheinfo {
 	struct cacheinfo *info_list;
+	unsigned int per_cpu_data_slice_size;
 	unsigned int num_levels;
 	unsigned int num_leaves;
 	bool cpu_map_populated;
-- 
cgit v1.2.3


From 362d37a106dd3f6431b2fdd91d9208b0d023b50d Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:56 +0800
Subject: mm, pcp: reduce lock contention for draining high-order pages

In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages
on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when
PCP is mostly used for high-order pages freeing to improve the cache-hot
pages reusing between page allocating and freeing CPUs.

On system with small per-CPU data cache slice, pages shouldn't be cached
before draining to guarantee cache-hot.  But on a system with large
per-CPU data cache slice, some pages can be cached before draining to
reduce zone lock contention.

So, in this patch, instead of draining without any caching, "pcp->batch"
pages will be cached in PCP before draining if the size of the per-CPU
data cache slice is more than "3 * batch".

In theory, if the size of per-CPU data cache slice is more than "2 *
batch", we can reuse cache-hot pages between CPUs.  But considering the
other usage of cache (code, other data accessing, etc.), "3 * batch" is
used.

Note: "3 * batch" is chosen to make sure the optimization works on recent
x86_64 server CPUs.  If you want to increase it, please check whether it
breaks the optimization.

On a 2-socket Intel server with 128 logical CPU, with the patch, the
network bandwidth of the UNIX (AF_UNIX) test case of lmbench test suite
with 16-pair processes increase 70.5%.  The cycles% of the spinlock
contention (mostly for zone lock) decreases from 46.1% to 21.3%.  The
number of PCP draining for high order pages freeing (free_high) decreases
89.9%.  The cache miss rate keeps 0.2%.

Link: https://lkml.kernel.org/r/20231016053002.756205-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h    | 1 +
 include/linux/mmzone.h | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 665f06675c83..665edc11fb9f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -325,6 +325,7 @@ void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
 
 void page_alloc_init_late(void);
+void setup_pcp_cacheinfo(void);
 
 /*
  * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index de313f1c15f9..efe72b3f7872 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -680,8 +680,14 @@ enum zone_watermarks {
  * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the
  * previous page freeing.  To avoid to drain PCP for an accident
  * high-order page freeing.
+ *
+ * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before
+ * draining PCP for consecutive high-order pages freeing without
+ * allocation if data cache slice of CPU is large enough.  To reduce
+ * zone lock contention and keep cache-hot pages reusing.
  */
 #define	PCPF_PREV_FREE_HIGH_ORDER	BIT(0)
+#define	PCPF_FREE_HIGH_BATCH		BIT(1)
 
 struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
-- 
cgit v1.2.3


From c0a242394cb980bd00e1f61dc8aacb453d2bbe6a Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:58 +0800
Subject: mm, page_alloc: scale the number of pages that are batch allocated

When a task is allocating a large number of order-0 pages, it may acquire
the zone->lock multiple times allocating pages in batches.  This may
unnecessarily contend on the zone lock when allocating very large number
of pages.  This patch adapts the size of the batch based on the recent
pattern to scale the batch size for subsequent allocations.

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.  With the patch, the
cycles% of the spinlock contention (mostly for zone lock) decreases from
12.6% to 11.0% (with PCP size == 367).

Link: https://lkml.kernel.org/r/20231016053002.756205-6-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index efe72b3f7872..e6cc2238c65f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -695,9 +695,10 @@ struct per_cpu_pages {
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
 	u8 flags;		/* protected by pcp->lock */
+	u8 alloc_factor;	/* batch scaling factor during allocate */
 	u8 free_factor;		/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
-	short expire;		/* When 0, remote pagesets are drained */
+	u8 expire;		/* When 0, remote pagesets are drained */
 #endif
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
-- 
cgit v1.2.3


From 90b41691b9881376fe784e13b5766ec3676fdb55 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:59 +0800
Subject: mm: add framework for PCP high auto-tuning

The page allocation performance requirements of different workloads are
usually different.  So, we need to tune PCP (per-CPU pageset) high to
optimize the workload page allocation performance.  Now, we have a system
wide sysctl knob (percpu_pagelist_high_fraction) to tune PCP high by hand.
But, it's hard to find out the best value by hand.  And one global
configuration may not work best for the different workloads that run on
the same system.  One solution to these issues is to tune PCP high of each
CPU automatically.

This patch adds the framework for PCP high auto-tuning.  With it,
pcp->high of each CPU will be changed automatically by tuning algorithm at
runtime.  The minimal high (pcp->high_min) is the original PCP high value
calculated based on the low watermark pages.  While the maximal high
(pcp->high_max) is the PCP high value when percpu_pagelist_high_fraction
sysctl knob is set to MIN_PERCPU_PAGELIST_HIGH_FRACTION.  That is, the
maximal pcp->high that can be set via sysctl knob by hand.

It's possible that PCP high auto-tuning doesn't work well for some
workloads.  So, when PCP high is tuned by hand via the sysctl knob, the
auto-tuning will be disabled.  The PCP high set by hand will be used
instead.

This patch only adds the framework, so pcp->high will be set to
pcp->high_min (original default) always.  We will add actual auto-tuning
algorithm in the following patches in the series.

Link: https://lkml.kernel.org/r/20231016053002.756205-7-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e6cc2238c65f..775abc899e80 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -693,6 +693,8 @@ struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
+	int high_min;		/* min high watermark */
+	int high_max;		/* max high watermark */
 	int batch;		/* chunk size for buddy add/remove */
 	u8 flags;		/* protected by pcp->lock */
 	u8 alloc_factor;	/* batch scaling factor during allocate */
@@ -852,7 +854,8 @@ struct zone {
 	 * the high and batch values are copied to individual pagesets for
 	 * faster access
 	 */
-	int pageset_high;
+	int pageset_high_min;
+	int pageset_high_max;
 	int pageset_batch;
 
 #ifndef CONFIG_SPARSEMEM
-- 
cgit v1.2.3


From 51a755c56dc05a8b31ed28d24f28354946dc7529 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:30:00 +0800
Subject: mm: tune PCP high automatically

The target to tune PCP high automatically is as follows,

- Minimize allocation/freeing from/to shared zone

- Minimize idle pages in PCP

- Minimize pages in PCP if the system free pages is too few

To reach these target, a tuning algorithm as follows is designed,

- When we refill PCP via allocating from the zone, increase PCP high.
  Because if we had larger PCP, we could avoid to allocate from the
  zone.

- In periodic vmstat updating kworker (via refresh_cpu_vm_stats()),
  decrease PCP high to try to free possible idle PCP pages.

- When page reclaiming is active for the zone, stop increasing PCP
  high in allocating path, decrease PCP high and free some pages in
  freeing path.

So, the PCP high can be tuned to the page allocating/freeing depth of
workloads eventually.

One issue of the algorithm is that if the number of pages allocated is
much more than that of pages freed on a CPU, the PCP high may become the
maximal value even if the allocating/freeing depth is small.  But this
isn't a severe issue, because there are no idle pages in this case.

One alternative choice is to increase PCP high when we drain PCP via
trying to free pages to the zone, but don't increase PCP high during PCP
refilling.  This can avoid the issue above.  But if the number of pages
allocated is much less than that of pages freed on a CPU, there will be
many idle pages in PCP and it is hard to free these idle pages.

1/8 (>> 3) of PCP high will be decreased periodically.  The value 1/8 is
kind of arbitrary.  Just to make sure that the idle PCP pages will be
freed eventually.

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.  With the patch, the
build time decreases 3.5%.  The cycles% of the spinlock contention (mostly
for zone lock) decreases from 11.0% to 0.5%.  The number of PCP draining
for high order pages freeing (free_high) decreases 65.6%.  The number of
pages allocated from zone (instead of from PCP) decreases 83.9%.

Link: https://lkml.kernel.org/r/20231016053002.756205-8-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Suggested-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 665edc11fb9f..5b917e5b9350 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -320,6 +320,7 @@ extern void page_frag_free(void *addr);
 #define free_page(addr) free_pages((addr), 0)
 
 void page_alloc_init_cpuhp(void);
+int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
-- 
cgit v1.2.3


From 57c0419c5f0ea2ccab8700895c8fac20ba1eb21f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:30:01 +0800
Subject: mm, pcp: decrease PCP high if free pages < high watermark

One target of PCP is to minimize pages in PCP if the system free pages is
too few.  To reach that target, when page reclaiming is active for the
zone (ZONE_RECLAIM_ACTIVE), we will stop increasing PCP high in allocating
path, decrease PCP high and free some pages in freeing path.  But this may
be too late because the background page reclaiming may introduce latency
for some workloads.  So, in this patch, during page allocation we will
detect whether the number of free pages of the zone is below high
watermark.  If so, we will stop increasing PCP high in allocating path,
decrease PCP high and free some pages in freeing path.  With this, we can
reduce the possibility of the premature background page reclaiming caused
by too large PCP.

The high watermark checking is done in allocating path to reduce the
overhead in hotter freeing path.

Link: https://lkml.kernel.org/r/20231016053002.756205-9-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 775abc899e80..b92ab001e146 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1016,6 +1016,7 @@ enum zone_flags {
 					 * Cleared when kswapd is woken.
 					 */
 	ZONE_RECLAIM_ACTIVE,		/* kswapd may be scanning the zone. */
+	ZONE_BELOW_HIGH,		/* zone is below high watermark. */
 };
 
 static inline unsigned long zone_managed_pages(struct zone *zone)
-- 
cgit v1.2.3


From 6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:30:02 +0800
Subject: mm, pcp: reduce detecting time of consecutive high order page freeing

In current PCP auto-tuning design, if the number of pages allocated is
much more than that of pages freed on a CPU, the PCP high may become the
maximal value even if the allocating/freeing depth is small, for example,
in the sender of network workloads.  If a CPU was used as sender
originally, then it is used as receiver after context switching, we need
to fill the whole PCP with maximal high before triggering PCP draining for
consecutive high order freeing.  This will hurt the performance of some
network workloads.

To solve the issue, in this patch, we will track the consecutive page
freeing with a counter in stead of relying on PCP draining.  So, we can
detect consecutive page freeing much earlier.

On a 2-socket Intel server with 128 logical CPU, we tested
SCTP_STREAM_MANY test case of netperf test suite with 64-pair processes.
With the patch, the network bandwidth improves 5.0%.  This restores the
performance drop caused by PCP auto-tuning.

Link: https://lkml.kernel.org/r/20231016053002.756205-10-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b92ab001e146..3c25226beeed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -698,10 +698,10 @@ struct per_cpu_pages {
 	int batch;		/* chunk size for buddy add/remove */
 	u8 flags;		/* protected by pcp->lock */
 	u8 alloc_factor;	/* batch scaling factor during allocate */
-	u8 free_factor;		/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
 	u8 expire;		/* When 0, remote pagesets are drained */
 #endif
+	short free_count;	/* consecutive free count */
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
 	struct list_head lists[NR_PCP_LISTS];
-- 
cgit v1.2.3


From 1aacbd354313f25c855e662e41c04e2abf71444a Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:42 -0700
Subject: mm: kmem: add direct objcg pointer to task_struct

To charge a freshly allocated kernel object to a memory cgroup, the kernel
needs to obtain an objcg pointer.  Currently it does it indirectly by
obtaining the memcg pointer first and then calling to
__get_obj_cgroup_from_memcg().

Usually tasks spend their entire life belonging to the same object cgroup.
So it makes sense to save the objcg pointer on task_struct directly, so
it can be obtained faster.  It requires some work on fork, exit and cgroup
migrate paths, but these paths are way colder.

To avoid any costly synchronization the following rules are applied:
1) A task sets it's objcg pointer itself.

2) If a task is being migrated to another cgroup, the least
   significant bit of the objcg pointer is set atomically.

3) On the allocation path the objcg pointer is obtained locklessly
   using the READ_ONCE() macro and the least significant bit is
   checked. If it's set, the following procedure is used to update
   it locklessly:
       - task->objcg is zeroed using cmpxcg
       - new objcg pointer is obtained
       - task->objcg is updated using try_cmpxchg
       - operation is repeated if try_cmpxcg fails
   It guarantees that no updates will be lost if task migration
   is racing against objcg pointer update. It also allows to keep
   both read and write paths fully lockless.

Because the task is keeping a reference to the objcg, it can't go away
while the task is alive.

This commit doesn't change the way the remote memcg charging works.

Link: https://lkml.kernel.org/r/20231019225346.1822282-3-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..60de42715b56 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1443,6 +1443,10 @@ struct task_struct {
 	struct mem_cgroup		*active_memcg;
 #endif
 
+#ifdef CONFIG_MEMCG_KMEM
+	struct obj_cgroup		*objcg;
+#endif
+
 #ifdef CONFIG_BLK_CGROUP
 	struct gendisk			*throttle_disk;
 #endif
-- 
cgit v1.2.3


From 675d6c9b59e313ca2573c93e8fd87011a99bb8ce Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:43 -0700
Subject: mm: kmem: make memcg keep a reference to the original objcg

Keep a reference to the original objcg object for the entire life of a
memcg structure.

This allows to simplify the synchronization on the kernel memory
allocation paths: pinning a (live) memcg will also pin the corresponding
objcg.

The memory overhead of this change is minimal because object cgroups
usually outlive their corresponding memory cgroups even without this
change, so it's only an additional pointer per memcg.

Link: https://lkml.kernel.org/r/20231019225346.1822282-4-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6674c12725d5..cc110cc8fdfc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -289,7 +289,13 @@ struct mem_cgroup {
 
 #ifdef CONFIG_MEMCG_KMEM
 	int kmemcg_id;
-	struct obj_cgroup __rcu *objcg;
+	/*
+	 * memcg->objcg is wiped out as a part of the objcg repaprenting
+	 * process. memcg->orig_objcg preserves a pointer (and a reference)
+	 * to the original objcg until the end of live of memcg.
+	 */
+	struct obj_cgroup __rcu	*objcg;
+	struct obj_cgroup	*orig_objcg;
 	/* list of inherited objcgs, protected by objcg_lock */
 	struct list_head objcg_list;
 #endif
-- 
cgit v1.2.3


From e86828e5446d95676835679837d995dec188d2be Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:44 -0700
Subject: mm: kmem: scoped objcg protection

Switch to a scope-based protection of the objcg pointer on slab/kmem
allocation paths.  Instead of using the get_() semantics in the
pre-allocation hook and put the reference afterwards, let's rely on the
fact that objcg is pinned by the scope.

It's possible because:
1) if the objcg is received from the current task struct, the task is
   keeping a reference to the objcg.
2) if the objcg is received from an active memcg (remote charging),
   the memcg is pinned by the scope and has a reference to the
   corresponding objcg.

Link: https://lkml.kernel.org/r/20231019225346.1822282-5-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 9 +++++++++
 include/linux/sched/mm.h   | 4 ++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index cc110cc8fdfc..8006bc3bd7bf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1796,6 +1796,15 @@ bool mem_cgroup_kmem_disabled(void);
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge_page(struct page *page, int order);
 
+/*
+ * The returned objcg pointer is safe to use without additional
+ * protection within a scope. The scope is defined either by
+ * the current task (similar to the "current" global variable)
+ * or by set_active_memcg() pair.
+ * Please, use obj_cgroup_get() to get a reference if the pointer
+ * needs to be used outside of the local scope.
+ */
+struct obj_cgroup *current_obj_cgroup(void);
 struct obj_cgroup *get_obj_cgroup_from_current(void);
 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 8d89c8c4fac1..9a19f1b42f64 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -403,6 +403,10 @@ DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
  * __GFP_ACCOUNT allocations till the end of the scope will be charged to the
  * given memcg.
  *
+ * Please, make sure that caller has a reference to the passed memcg structure,
+ * so its lifetime is guaranteed to exceed the scope between two
+ * set_active_memcg() calls.
+ *
  * NOTE: This function can nest. Users must save the return value and
  * reset the previous value after their own charging scope is over.
  */
-- 
cgit v1.2.3


From e56808fef8f71a192b2740c0b6ea8be7ab865d54 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:46 -0700
Subject: mm: kmem: reimplement get_obj_cgroup_from_current()

Reimplement get_obj_cgroup_from_current() using current_obj_cgroup().
get_obj_cgroup_from_current() and current_obj_cgroup() share 80% of the
code, so the new implementation is almost trivial.

get_obj_cgroup_from_current() is a convenient function used by the
bpf subsystem, so there is no reason to get rid of it completely.

Link: https://lkml.kernel.org/r/20231019225346.1822282-7-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8006bc3bd7bf..b3d2e3e60eed 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1805,9 +1805,18 @@ void __memcg_kmem_uncharge_page(struct page *page, int order);
  * needs to be used outside of the local scope.
  */
 struct obj_cgroup *current_obj_cgroup(void);
-struct obj_cgroup *get_obj_cgroup_from_current(void);
 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
 
+static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+	struct obj_cgroup *objcg = current_obj_cgroup();
+
+	if (objcg)
+		obj_cgroup_get(objcg);
+
+	return objcg;
+}
+
 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
 
-- 
cgit v1.2.3


From 1d44f2e6d178163a94980fd5f9a4b04b6b36535b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:48 +0800
Subject: mm_types: add virtual and _last_cpupid into struct folio

Patch series "mm: convert page cpupid functions to folios", v3.

The cpupid(or access time) used by numa balancing is stored in flags or
_last_cpupid(if LAST_CPUPID_NOT_IN_PAGE_FLAGS) of page, this is to convert
page cpupid to folio cpupid, a new _last_cpupid is added into folio, which
make us to use folio->_last_cpupid directly, and the page cpupid functions
are converted to folio ones.

  page_cpupid_last()		-> folio_last_cpupid()
  xchg_page_access_time()	-> folio_xchg_access_time()
  page_cpupid_xchg_last()	-> folio_xchg_last_cpupid()


This patch (of 19):

If WANT_PAGE_VIRTUAL and LAST_CPUPID_NOT_IN_PAGE_FLAGS defined, the
'virtual' and '_last_cpupid' are in struct page, and since _last_cpupid is
used by numa balancing feature, it is better to move it before KMSAN
metadata from struct page, also add them into struct folio to make us to
access them from folio directly.

Link: https://lkml.kernel.org/r/20231018140806.2783514-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20231018140806.2783514-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 21eb56145f57..692e41213cd3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -199,6 +199,10 @@ struct page {
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
 
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+	int _last_cpupid;
+#endif
+
 #ifdef CONFIG_KMSAN
 	/*
 	 * KMSAN metadata for this page:
@@ -210,10 +214,6 @@ struct page {
 	struct page *kmsan_shadow;
 	struct page *kmsan_origin;
 #endif
-
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-	int _last_cpupid;
-#endif
 } _struct_page_alignment;
 
 /*
@@ -272,6 +272,8 @@ typedef struct {
  * @_refcount: Do not access this member directly.  Use folio_ref_count()
  *    to find how many references there are to this folio.
  * @memcg_data: Memory Control Group data.
+ * @virtual: Virtual address in the kernel direct map.
+ * @_last_cpupid: IDs of last CPU and last process that accessed the folio.
  * @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
  * @_nr_pages_mapped: Do not use directly, call folio_mapcount().
  * @_pincount: Do not use directly, call folio_maybe_dma_pinned().
@@ -317,6 +319,12 @@ struct folio {
 			atomic_t _refcount;
 #ifdef CONFIG_MEMCG
 			unsigned long memcg_data;
+#endif
+#if defined(WANT_PAGE_VIRTUAL)
+			void *virtual;
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+			int _last_cpupid;
 #endif
 	/* private: the union with struct page is transitional */
 		};
@@ -373,6 +381,12 @@ FOLIO_MATCH(_refcount, _refcount);
 #ifdef CONFIG_MEMCG
 FOLIO_MATCH(memcg_data, memcg_data);
 #endif
+#if defined(WANT_PAGE_VIRTUAL)
+FOLIO_MATCH(virtual, virtual);
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+FOLIO_MATCH(_last_cpupid, _last_cpupid);
+#endif
 #undef FOLIO_MATCH
 #define FOLIO_MATCH(pg, fl)						\
 	static_assert(offsetof(struct folio, fl) ==			\
-- 
cgit v1.2.3


From 155c98cfcf961327adedabd629edfc2301cf354b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:49 +0800
Subject: mm: add folio_last_cpupid()

Add folio_last_cpupid() wrapper, which is required to convert
page_cpupid_last() to folio vertion later in the series.

Link: https://lkml.kernel.org/r/20231018140806.2783514-3-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 86e040e886e4..0c4ff852930b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1796,6 +1796,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline int folio_last_cpupid(struct folio *folio)
+{
+	return page_cpupid_last(&folio->page);
+}
+
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From f39eac30a8f334f0765ef78fe4d13b3fd5bfa3fd Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:53 +0800
Subject: mm: remove page_cpupid_last()

Since all calls use folio_last_cpupid(), remove page_cpupid_last().

Link: https://lkml.kernel.org/r/20231018140806.2783514-7-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0c4ff852930b..f8b7a1825720 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1702,18 +1702,18 @@ static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 	return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
 }
 
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-	return page->_last_cpupid;
+	return folio->_last_cpupid;
 }
 static inline void page_cpupid_reset_last(struct page *page)
 {
 	page->_last_cpupid = -1 & LAST_CPUPID_MASK;
 }
 #else
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-	return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+	return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
 }
 
 extern int page_cpupid_xchg_last(struct page *page, int cpupid);
@@ -1752,9 +1752,9 @@ static inline int xchg_page_access_time(struct page *page, int time)
 	return 0;
 }
 
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-	return page_to_nid(page); /* XXX */
+	return folio_nid(folio); /* XXX */
 }
 
 static inline int cpupid_to_nid(int cpupid)
@@ -1796,11 +1796,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline int folio_last_cpupid(struct folio *folio)
-{
-	return page_cpupid_last(&folio->page);
-}
-
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From 55c199385c4465e9abe1a3d6d1aba348d0356e03 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:54 +0800
Subject: mm: add folio_xchg_access_time()

Add folio_xchg_access_time() wrapper, which is required to convert
xchg_page_access_time() to folio vertion later in the series.

Link: https://lkml.kernel.org/r/20231018140806.2783514-8-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f8b7a1825720..dee63b820f0c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1796,6 +1796,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline int folio_xchg_access_time(struct folio *folio, int time)
+{
+	return xchg_page_access_time(&folio->page, time);
+}
+
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From f393084382fa3bbd5840b428d538dbcb33be0186 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:58 +0800
Subject: mm: remove xchg_page_access_time()

Since all calls use folio_xchg_access_time(), remove
xchg_page_access_time().

Link: https://lkml.kernel.org/r/20231018140806.2783514-12-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dee63b820f0c..e003465238c4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1724,11 +1724,12 @@ static inline void page_cpupid_reset_last(struct page *page)
 }
 #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
 
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 	int last_time;
 
-	last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+	last_time = page_cpupid_xchg_last(&folio->page,
+					  time >> PAGE_ACCESS_TIME_BUCKETS);
 	return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
 
@@ -1747,7 +1748,7 @@ static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 	return page_to_nid(page); /* XXX */
 }
 
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 	return 0;
 }
@@ -1796,11 +1797,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline int folio_xchg_access_time(struct folio *folio, int time)
-{
-	return xchg_page_access_time(&folio->page, time);
-}
-
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From 136d0b47576f8701d68c2d504e7237d9fdc4ebbd Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:59 +0800
Subject: mm: add folio_xchg_last_cpupid()

Add folio_xchg_last_cpupid() wrapper, which is required to convert
page_cpupid_xchg_last() to folio vertion later in the series.

Link: https://lkml.kernel.org/r/20231018140806.2783514-13-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e003465238c4..00e0874f7ce2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1797,6 +1797,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
+{
+	return page_cpupid_xchg_last(&folio->page, cpupid);
+}
+
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From c08b7e3830dbf24dd2552ddeea84f00393842f1b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:08:03 +0800
Subject: mm: make finish_mkwrite_fault() static

Make finish_mkwrite_fault static since it is not used outside of
memory.c.

Link: https://lkml.kernel.org/r/20231018140806.2783514-17-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00e0874f7ce2..5d6e1d7656c2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1348,7 +1348,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		struct page *page, unsigned int nr, unsigned long addr);
 
 vm_fault_t finish_fault(struct vm_fault *vmf);
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
 #endif
 
 /*
-- 
cgit v1.2.3


From 8f0f4788b1247c2f92ecacd8f86ce0b379b807b9 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:08:06 +0800
Subject: mm: remove page_cpupid_xchg_last()

Since all calls use folio_xchg_last_cpupid(), remove
page_cpupid_xchg_last().

Link: https://lkml.kernel.org/r/20231018140806.2783514-20-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5d6e1d7656c2..0e496a8b8e50 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1696,9 +1696,9 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
 
 #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
 {
-	return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
+	return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK);
 }
 
 static inline int folio_last_cpupid(struct folio *folio)
@@ -1715,7 +1715,7 @@ static inline int folio_last_cpupid(struct folio *folio)
 	return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
 }
 
-extern int page_cpupid_xchg_last(struct page *page, int cpupid);
+int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
 
 static inline void page_cpupid_reset_last(struct page *page)
 {
@@ -1727,8 +1727,8 @@ static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 	int last_time;
 
-	last_time = page_cpupid_xchg_last(&folio->page,
-					  time >> PAGE_ACCESS_TIME_BUCKETS);
+	last_time = folio_xchg_last_cpupid(folio,
+					   time >> PAGE_ACCESS_TIME_BUCKETS);
 	return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
 
@@ -1742,9 +1742,9 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
 {
-	return page_to_nid(page); /* XXX */
+	return folio_nid(folio); /* XXX */
 }
 
 static inline int folio_xchg_access_time(struct folio *folio, int time)
@@ -1796,11 +1796,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
-{
-	return page_cpupid_xchg_last(&folio->page, cpupid);
-}
-
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From 80203f1ca086835100843f1474bd6dd4a48cc73b Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Wed, 18 Oct 2023 18:29:47 +0800
Subject: bootmem: use kmemleak_free_part_phys in free_bootmem_page

Since kmemleak_alloc_phys() rather than kmemleak_alloc() was called from
memblock_alloc_range_nid(), kmemleak_free_part_phys() should be used to
delete kmemleak object in free_bootmem_page().  In debug mode, there are
following warning:

 kmemleak: Partially freeing unknown object at 0xffff97345aff7000 (size 4096)

Link: https://lkml.kernel.org/r/20231018102952.3339837-3-liushixin2@huawei.com
Fixes: 028725e73375 ("bootmem: remove the vmemmap pages from kmemleak in free_bootmem_page")
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Patrick Wang <patrick.wang.shcn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/bootmem_info.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index e1a3c9c9754c..cffa38a73618 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -60,7 +60,7 @@ static inline void get_page_bootmem(unsigned long info, struct page *page,
 
 static inline void free_bootmem_page(struct page *page)
 {
-	kmemleak_free_part(page_to_virt(page), PAGE_SIZE);
+	kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
 	free_reserved_page(page);
 }
 #endif
-- 
cgit v1.2.3


From b455f39d228935f88eebcd1f7c1a6981093c6a3b Mon Sep 17 00:00:00 2001
From: "Vishal Moola (Oracle)" <vishal.moola@gmail.com>
Date: Fri, 20 Oct 2023 11:33:30 -0700
Subject: mm/khugepaged: convert alloc_charge_hpage() to use folios

Also remove count_memcg_page_event now that its last caller no longer uses
it and reword hpage_collapse_alloc_page() to hpage_collapse_alloc_folio().

This removes 1 call to compound_head() and helps convert khugepaged to
use folios throughout.

Link: https://lkml.kernel.org/r/20231020183331.10770-5-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b3d2e3e60eed..7bdcf3020d7a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1087,15 +1087,6 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
 	local_irq_restore(flags);
 }
 
-static inline void count_memcg_page_event(struct page *page,
-					  enum vm_event_item idx)
-{
-	struct mem_cgroup *memcg = page_memcg(page);
-
-	if (memcg)
-		count_memcg_events(memcg, idx, 1);
-}
-
 static inline void count_memcg_folio_events(struct folio *folio,
 		enum vm_event_item idx, unsigned long nr)
 {
@@ -1598,11 +1589,6 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg,
 {
 }
 
-static inline void count_memcg_page_event(struct page *page,
-					  int idx)
-{
-}
-
 static inline void count_memcg_folio_events(struct folio *folio,
 		enum vm_event_item idx, unsigned long nr)
 {
-- 
cgit v1.2.3


From 35f5d94187a6a3a8df2cba54beccca1c2379edb8 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 19 Oct 2023 19:49:20 +0000
Subject: mm/damon: implement a function for max nr_accesses safe calculation

Patch series "avoid divide-by-zero due to max_nr_accesses overflow".

The maximum nr_accesses of given DAMON context can be calculated by
dividing the aggregation interval by the sampling interval.  Some logics
in DAMON uses the maximum nr_accesses as a divisor.  Hence, the value
shouldn't be zero.  Such case is avoided since DAMON avoids setting the
agregation interval as samller than the sampling interval.  However, since
nr_accesses is unsigned int while the intervals are unsigned long, the
maximum nr_accesses could be zero while casting.

Avoid the divide-by-zero by implementing a function that handles the
corner case (first patch), and replaces the vulnerable direct max
nr_accesses calculations (remaining patches).

Note that the patches for the replacements are divided for broken commits,
to make backporting on required tres easier.  Especially, the last patch
is for a patch that not yet merged into the mainline but in mm tree.


This patch (of 4):

The maximum nr_accesses of given DAMON context can be calculated by
dividing the aggregation interval by the sampling interval.  Some logics
in DAMON uses the maximum nr_accesses as a divisor.  Hence, the value
shouldn't be zero.  Such case is avoided since DAMON avoids setting the
agregation interval as samller than the sampling interval.  However, since
nr_accesses is unsigned int while the intervals are unsigned long, the
maximum nr_accesses could be zero while casting.  Implement a function
that handles the corner case.

Note that this commit is not fixing the real issue since this is only
introducing the safe function that will replaces the problematic
divisions.  The replacements will be made by followup commits, to make
backporting on stable series easier.

Link: https://lkml.kernel.org/r/20231019194924.100347-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20231019194924.100347-2-sj@kernel.org
Fixes: 198f0f4c58b9 ("mm/damon/vaddr,paddr: support pageout prioritization")
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Jakub Acs <acsjakub@amazon.de>
Cc: <stable@vger.kernel.org>	[5.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 27b995c22497..ab2f17d9926b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -681,6 +681,13 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx)
 	return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR;
 }
 
+static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs)
+{
+	/* {aggr,sample}_interval are unsigned long, hence could overflow */
+	return min(attrs->aggr_interval / attrs->sample_interval,
+			(unsigned long)UINT_MAX);
+}
+
 
 int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
 int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
-- 
cgit v1.2.3


From 10969b5571387047cd461a9c701b8b9cd007f6c7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:15:09 -0700
Subject: hugetlbfs: drop shared NUMA mempolicy pretence

Patch series "mempolicy: cleanups leading to NUMA mpol without vma", v2.

Mostly cleanups in mm/mempolicy.c, but finally removing the pseudo-vma
from shmem folio allocation, and removing the mmap_lock around folio
migration for mbind and migrate_pages syscalls.


This patch (of 12):

hugetlbfs_fallocate() goes through the motions of pasting a shared NUMA
mempolicy onto its pseudo-vma, but how could there ever be a shared NUMA
mempolicy for this file?  hugetlb_vm_ops has never offered a set_policy
method, and hugetlbfs_parse_param() has never supported any mpol options
for a mount-wide default policy.

It's just an illusion: clean it away so as not to confuse others, giving
us more freedom to adjust shmem's set_policy/get_policy implementation.
But hugetlbfs_inode_info is still required, just to accommodate seals.

Yes, shared NUMA mempolicy support could be added to hugetlbfs, with a
set_policy method and/or mpol mount option (Andi's first posting did
include an admitted-unsatisfactory hugetlb_set_policy()); but it seems
that nobody has bothered to add that in the nineteen years since v2.6.7
made it possible, and there is at least one company that has invested
enough into hugetlbfs, that I guess they have learnt well enough how to
manage its NUMA, without needing shared mempolicy.

Remove linux/mempolicy.h from linux/hugetlb.h: include linux/pagemap.h in
its place, because hugetlb.h's recently added use of filemap_lock_folio()
requires that (although most .configs and .c's get it in some other way).

Link: https://lkml.kernel.org/r/ebc0987e-beff-8bfb-9283-234c2cbd17c5@google.com
Link: https://lkml.kernel.org/r/cae82d4b-904a-faaf-282a-34fcc188c81f@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 205469aa0613..158ff156710b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -30,7 +30,7 @@ void free_huge_folio(struct folio *folio);
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-#include <linux/mempolicy.h>
+#include <linux/pagemap.h>
 #include <linux/shm.h>
 #include <asm/tlbflush.h>
 
@@ -545,7 +545,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 }
 
 struct hugetlbfs_inode_info {
-	struct shared_policy policy;
 	struct inode vfs_inode;
 	unsigned int seals;
 };
-- 
cgit v1.2.3


From c36f6e6dff4d32ec8b6da8f553933727a57a7a4a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:20:14 -0700
Subject: mempolicy trivia: slightly more consistent naming

Before getting down to work, do a little cleanup, mainly of inconsistent
variable naming.  I gave up trying to rationalize mpol versus pol versus
policy, and node versus nid, but let's avoid p and nd.  Remove a few
superfluous blank lines, but add one; and here prefer vma->vm_policy to
vma_policy(vma) - the latter being appropriate in other sources, which
have to allow for !CONFIG_NUMA.  That intriguing line about KERNEL_DS?
should have gone in v2.6.15, when numa_policy_init() stopped using
set_mempolicy(2)'s system call handler.

Link: https://lkml.kernel.org/r/68287974-b6ae-7df-4ba-d19ddd69cbf@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1d7f4ec2614c..a807976fe95d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -124,10 +124,9 @@ struct shared_policy {
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
-int mpol_set_shared_policy(struct shared_policy *info,
-				struct vm_area_struct *vma,
-				struct mempolicy *new);
-void mpol_free_shared_policy(struct shared_policy *p);
+int mpol_set_shared_policy(struct shared_policy *sp,
+			   struct vm_area_struct *vma, struct mempolicy *mpol);
+void mpol_free_shared_policy(struct shared_policy *sp);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 					    unsigned long idx);
 
@@ -191,7 +190,7 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	return true;
 }
 
-static inline void mpol_put(struct mempolicy *p)
+static inline void mpol_put(struct mempolicy *pol)
 {
 }
 
@@ -210,7 +209,7 @@ static inline void mpol_shared_policy_init(struct shared_policy *sp,
 {
 }
 
-static inline void mpol_free_shared_policy(struct shared_policy *p)
+static inline void mpol_free_shared_policy(struct shared_policy *sp)
 {
 }
 
-- 
cgit v1.2.3


From 93397c3b7684555b7cec726cd13eef6742d191fe Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:21:34 -0700
Subject: mempolicy trivia: use pgoff_t in shared mempolicy tree

Prefer the more explicit "pgoff_t" to "unsigned long" when dealing with a
shared mempolicy tree.  Delete confusing comment about pseudo mm vmas.

Link: https://lkml.kernel.org/r/5451157-3818-4af5-fd2c-5d26a5d1dc53@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index a807976fe95d..acdb12fcb6cd 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -105,22 +105,16 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 
 /*
  * Tree of shared policies for a shared memory region.
- * Maintain the policies in a pseudo mm that contains vmas. The vmas
- * carry the policy. As a special twist the pseudo mm is indexed in pages, not
- * bytes, so that we can work with shared memory segments bigger than
- * unsigned long.
  */
-
-struct sp_node {
-	struct rb_node nd;
-	unsigned long start, end;
-	struct mempolicy *policy;
-};
-
 struct shared_policy {
 	struct rb_root root;
 	rwlock_t lock;
 };
+struct sp_node {
+	struct rb_node nd;
+	pgoff_t start, end;
+	struct mempolicy *policy;
+};
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
@@ -128,7 +122,7 @@ int mpol_set_shared_policy(struct shared_policy *sp,
 			   struct vm_area_struct *vma, struct mempolicy *mpol);
 void mpol_free_shared_policy(struct shared_policy *sp);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
-					    unsigned long idx);
+					    pgoff_t idx);
 
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
@@ -214,7 +208,7 @@ static inline void mpol_free_shared_policy(struct shared_policy *sp)
 }
 
 static inline struct mempolicy *
-mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From ddc1a5cbc05dc62743a2f409b96faa5cf95ba064 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 Oct 2023 13:39:08 -0700
Subject: mempolicy: alloc_pages_mpol() for NUMA policy without vma

Shrink shmem's stack usage by eliminating the pseudo-vma from its folio
allocation.  alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the
principal actor for passing mempolicy choice down to __alloc_pages(),
rather than vma_alloc_folio(gfp, order, vma, addr, hugepage).

vma_alloc_folio() and alloc_pages() remain, but as wrappers around
alloc_pages_mpol().  alloc_pages_bulk_*() untouched, except to provide the
additional args to policy_nodemask(), which subsumes policy_node().
Cleanup throughout, cutting out some unhelpful "helpers".

It would all be much simpler without MPOL_INTERLEAVE, but that adds a
dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd
("tmpfs: distribute interleave better across nodes"), which added ino bias
to the interleave, hidden from mm/mempolicy.c until this commit.

Hence "ilx" throughout, the "interleave index".  Originally I thought it
could be done just with nid, but that's wrong: the nodemask may come from
the shared policy layer below a shmem vma, or it may come from the task
layer above a shmem vma; and without the final nodemask then nodeid cannot
be decided.  And how ilx is applied depends also on page order.

The interleave index is almost always irrelevant unless MPOL_INTERLEAVE:
with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX
passed down from vma-less alloc_pages() is also used as hint not to use
THP-style hugepage allocation - to avoid the overhead of a hugepage arg
(though I don't understand why we never just added a GFP bit for THP - if
it actually needs a different allocation strategy from other pages of the
same order).  vma_alloc_folio() still carries its hugepage arg here, but
it is not used, and should be removed when agreed.

get_vma_policy() no longer allows a NULL vma: over time I believe we've
eradicated all the places which used to need it e.g.  swapoff and madvise
used to pass NULL vma to read_swap_cache_async(), but now know the vma.

[hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()]
  Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com
Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com
Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h       | 10 +++++++++-
 include/linux/mempolicy.h | 20 +++++++++++++++++---
 include/linux/mm.h        |  2 +-
 3 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5b917e5b9350..de292a007138 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -8,6 +8,7 @@
 #include <linux/topology.h>
 
 struct vm_area_struct;
+struct mempolicy;
 
 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
@@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 
 #ifdef CONFIG_NUMA
 struct page *alloc_pages(gfp_t gfp, unsigned int order);
-struct folio *folio_alloc(gfp_t gfp, unsigned order);
+struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+		struct mempolicy *mpol, pgoff_t ilx, int nid);
+struct folio *folio_alloc(gfp_t gfp, unsigned int order);
 struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
 		unsigned long addr, bool hugepage);
 #else
@@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	return alloc_pages_node(numa_node_id(), gfp_mask, order);
 }
+static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+		struct mempolicy *mpol, pgoff_t ilx, int nid)
+{
+	return alloc_pages(gfp, order);
+}
 static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
 {
 	return __folio_alloc_node(gfp, order, numa_node_id());
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index acdb12fcb6cd..931b118336f4 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -17,6 +17,8 @@
 
 struct mm_struct;
 
+#define NO_INTERLEAVE_INDEX (-1UL)	/* use task il_prev for interleaving */
+
 #ifdef CONFIG_NUMA
 
 /*
@@ -126,7 +128,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
-		unsigned long addr);
+		unsigned long addr, pgoff_t *ilx);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+		unsigned long addr, int order, pgoff_t *ilx);
 bool vma_policy_mof(struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);
@@ -140,8 +144,6 @@ extern int huge_node(struct vm_area_struct *vma,
 extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
 extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
 				const nodemask_t *mask);
-extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
-
 extern unsigned int mempolicy_slab_node(void);
 
 extern enum zone_type policy_zone;
@@ -179,6 +181,11 @@ extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
 
 struct mempolicy {};
 
+static inline struct mempolicy *get_task_policy(struct task_struct *p)
+{
+	return NULL;
+}
+
 static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
 	return true;
@@ -213,6 +220,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
 	return NULL;
 }
 
+static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+				unsigned long addr, int order, pgoff_t *ilx)
+{
+	*ilx = 0;
+	return NULL;
+}
+
 static inline int
 vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e496a8b8e50..14d5aaff96d0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -619,7 +619,7 @@ struct vm_operations_struct {
 	 * policy.
 	 */
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
-					unsigned long addr);
+					unsigned long addr, pgoff_t *ilx);
 #endif
 	/*
 	 * Called by vm_normal_page() for special PTEs to find the
-- 
cgit v1.2.3


From 72e315f7a750281b4410ac30d8930f735459e72d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:27:47 -0700
Subject: mempolicy: mmap_lock is not needed while migrating folios

mbind(2) holds down_write of current task's mmap_lock throughout
(exclusive because it needs to set the new mempolicy on the vmas);
migrate_pages(2) holds down_read of pid's mmap_lock throughout.

They both hold mmap_lock across the internal migrate_pages(), under which
all new page allocations (huge or small) are made.  I'm nervous about it;
and migrate_pages() certainly does not need mmap_lock itself.  It's done
this way for mbind(2), because its page allocator is vma_alloc_folio() or
alloc_hugetlb_folio_vma(), both of which depend on vma and address.

Now that we have alloc_pages_mpol(), depending on (refcounted) memory
policy and interleave index, mbind(2) can be modified to use that or
alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the
internal migrate_pages() at all: add alloc_migration_target_by_mpol() to
replace mbind's new_page().

(After that change, alloc_hugetlb_folio_vma() is used by nothing but a
userfaultfd function: move it out of hugetlb.h and into the #ifdef.)

migrate_pages(2) has chosen its target node before migrating, so can
continue to use the standard alloc_migration_target(); but let it take and
drop mmap_lock just around migrate_to_node()'s queue_pages_range():
neither the node-to-node calculations nor the page migrations need it.

It seems unlikely, but it is conceivable that some userspace depends on
the kernel's mmap_lock exclusion here, instead of doing its own locking:
more likely in a testsuite than in real life.  It is also possible, of
course, that some pages on the list will be munmapped by another thread
before they are migrated, or a newer memory policy applied to the range by
that time: but such races could happen before, as soon as mmap_lock was
dropped, so it does not appear to be a concern.

Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 158ff156710b..d3acecc5db4b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -748,8 +748,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 				nodemask_t *nmask, gfp_t gfp_mask);
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
-				unsigned long address);
 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
 			pgoff_t idx);
 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -1072,13 +1070,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return NULL;
 }
 
-static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
-					       struct vm_area_struct *vma,
-					       unsigned long address)
-{
-	return NULL;
-}
-
 static inline int __alloc_bootmem_huge_page(struct hstate *h)
 {
 	return 0;
-- 
cgit v1.2.3