Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/alloc_tag.h        |  11
-rw-r--r--  include/linux/bug.h              |  10
-rw-r--r--  include/linux/damon.h            | 112
-rw-r--r--  include/linux/fs.h               |   7
-rw-r--r--  include/linux/gfp.h              |  30
-rw-r--r--  include/linux/huge_mm.h          |   2
-rw-r--r--  include/linux/hugetlb.h          |  19
-rw-r--r--  include/linux/kasan.h            |   2
-rw-r--r--  include/linux/ksm.h              |   1
-rw-r--r--  include/linux/list_lru.h         |  44
-rw-r--r--  include/linux/memblock.h         |  10
-rw-r--r--  include/linux/memcontrol.h       |  43
-rw-r--r--  include/linux/memfd.h            |  23
-rw-r--r--  include/linux/memory_hotplug.h   |   5
-rw-r--r--  include/linux/migrate.h          |   6
-rw-r--r--  include/linux/mm.h               | 129
-rw-r--r--  include/linux/mm_inline.h        |  99
-rw-r--r--  include/linux/mm_types.h         |   4
-rw-r--r--  include/linux/mmdebug.h          |  14
-rw-r--r--  include/linux/mmzone.h           |  99
-rw-r--r--  include/linux/numa_memblks.h     |   3
-rw-r--r--  include/linux/page-flags.h       |  23
-rw-r--r--  include/linux/page-isolation.h   |   2
-rw-r--r--  include/linux/pagemap.h          |  24
-rw-r--r--  include/linux/pgtable.h          |   9
-rw-r--r--  include/linux/sched/hotplug.h    |   4
-rw-r--r--  include/linux/seqlock.h          |   1
-rw-r--r--  include/linux/swap.h             |  34
-rw-r--r--  include/linux/swap_cgroup.h      |  14
-rw-r--r--  include/linux/swap_slots.h       |   3
-rw-r--r--  include/linux/task_work.h        |   3
31 files changed, 447 insertions, 343 deletions
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 0bbbe537c5f9..a946e0203e6d 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -224,9 +224,14 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
#define alloc_hooks_tag(_tag, _do_alloc) \
({ \
- struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \
- typeof(_do_alloc) _res = _do_alloc; \
- alloc_tag_restore(_tag, _old); \
+ typeof(_do_alloc) _res; \
+ if (mem_alloc_profiling_enabled()) { \
+ struct alloc_tag * __maybe_unused _old; \
+ _old = alloc_tag_save(_tag); \
+ _res = _do_alloc; \
+ alloc_tag_restore(_tag, _old); \
+ } else \
+ _res = _do_alloc; \
_res; \
})
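
The hunk above makes alloc_hooks_tag() skip the tag save/restore entirely when memory allocation profiling is runtime-disabled. A minimal sketch of a caller-side wrapper, assuming the DEFINE_ALLOC_TAG() helper from this header; my_alloc_page() is hypothetical:

static struct page *my_alloc_page(gfp_t gfp)
{
	DEFINE_ALLOC_TAG(_alloc_tag);	/* per-callsite tag (assumed helper) */

	/* with profiling disabled, only alloc_pages_noprof() runs now */
	return alloc_hooks_tag(&_alloc_tag, alloc_pages_noprof(gfp, 0));
}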
diff --git a/include/linux/bug.h b/include/linux/bug.h
index 348acf2558f3..a9948a9f1093 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -73,15 +73,23 @@ static inline void generic_bug_clear_once(void) {}
#endif /* CONFIG_GENERIC_BUG */
+#ifdef CONFIG_PRINTK
+void mem_dump_obj(void *object);
+#else
+static inline void mem_dump_obj(void *object) {}
+#endif
+
/*
* Since detected data corruption should stop operation on the affected
* structures. Return value must be checked and sanely acted on by caller.
*/
static inline __must_check bool check_data_corruption(bool v) { return v; }
-#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \
+#define CHECK_DATA_CORRUPTION(condition, addr, fmt, ...) \
check_data_corruption(({ \
bool corruption = unlikely(condition); \
if (corruption) { \
+ if (addr) \
+ mem_dump_obj(addr); \
if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
pr_err(fmt, ##__VA_ARGS__); \
BUG(); \
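
The extra @addr argument lets CHECK_DATA_CORRUPTION() dump the offending object via mem_dump_obj() before warning or BUGing. A hedged sketch in the style of the list debugging checks (message text and helper name are illustrative):

static bool my_list_add_valid(struct list_head *new, struct list_head *prev,
			      struct list_head *next)
{
	if (CHECK_DATA_CORRUPTION(next->prev != prev, next,
			"list_add corruption: next->prev should be prev (%px), was %px\n",
			prev, next->prev))
		return false;
	return true;
}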
diff --git a/include/linux/damon.h b/include/linux/damon.h
index a67f2c4940e9..af525252b853 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -193,11 +193,16 @@ struct damos_quota_goal {
* size quota is set, DAMON tries to apply the action only up to &sz bytes
* within &reset_interval.
*
- * Internally, the time quota is transformed to a size quota using estimated
- * throughput of the scheme's action. DAMON then compares it against &sz and
- * uses smaller one as the effective quota.
+ * To convince the different types of quotas and goals, DAMON internally
+ * converts those into one single size quota called "effective quota". DAMON
+ * internally uses it as the only one real quota. The conversion is made as
+ * follows.
*
- * If @goals is not empt, DAMON calculates yet another size quota based on the
+ * The time quota is transformed to a size quota using estimated throughput of
+ * the scheme's action. DAMON then compares it against &sz and uses smaller
+ * one as the effective quota.
+ *
+ * If @goals is not empty, DAMON calculates yet another size quota based on the
* goals using its internal feedback loop algorithm, for every @reset_interval.
* Then, if the new size quota is smaller than the effective quota, it uses the
* new size quota as the effective quota.
@@ -286,13 +291,33 @@ struct damos_watermarks {
* @sz_tried: Total size of regions that the scheme is tried to be applied.
* @nr_applied: Total number of regions that the scheme is applied.
* @sz_applied: Total size of regions that the scheme is applied.
+ * @sz_ops_filter_passed:
+ * Total bytes that passed ops layer-handled DAMOS filters.
* @qt_exceeds: Total number of times the quota of the scheme has exceeded.
+ *
+ * "Tried an action to a region" in this context means the DAMOS core logic
+ * determined the region as eligible to apply the action. The access pattern
+ * (&struct damos_access_pattern), quotas (&struct damos_quota), watermarks
+ * (&struct damos_watermarks) and filters (&struct damos_filter) that handled
+ * on core logic can affect this. The core logic asks the operation set
+ * (&struct damon_operations) to apply the action to the region.
+ *
+ * "Applied an action to a region" in this context means the operation set
+ * (&struct damon_operations) successfully applied the action to the region, at
+ * least to a part of the region. The filters (&struct damos_filter) that
+ * handled on operation set layer and type of the action and pages of the
+ * region can affect this. For example, if a filter is set to exclude
+ * anonymous pages and the region has only anonymous pages, the region will be
+ * failed at applying the action. If the action is &DAMOS_PAGEOUT and all
+ * pages of the region are already paged out, the region will be failed at
+ * applying the action.
*/
struct damos_stat {
unsigned long nr_tried;
unsigned long sz_tried;
unsigned long nr_applied;
unsigned long sz_applied;
+ unsigned long sz_ops_filter_passed;
unsigned long qt_exceeds;
};
@@ -327,8 +352,9 @@ enum damos_filter_type {
/**
* struct damos_filter - DAMOS action target memory filter.
- * @type: Type of the page.
- * @matching: If the matching page should filtered out or in.
+ * @type: Type of the target memory.
+ * @matching: Whether this is for @type-matching memory.
+ * @allow: Whether to include or exclude the @matching memory.
* @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG.
* @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR.
* @target_idx: Index of the &struct damon_target of
@@ -337,13 +363,15 @@ enum damos_filter_type {
* @list: List head for siblings.
*
* Before applying the &damos->action to a memory region, DAMOS checks if each
- * page of the region matches to this and avoid applying the action if so.
- * Support of each filter type depends on the running &struct damon_operations
- * and the type. Refer to &enum damos_filter_type for more detai.
+ * byte of the region matches to this given condition and avoid applying the
+ * action if so. Support of each filter type depends on the running &struct
+ * damon_operations and the type. Refer to &enum damos_filter_type for more
+ * details.
*/
struct damos_filter {
enum damos_filter_type type;
bool matching;
+ bool allow;
union {
unsigned short memcg_id;
struct damon_addr_range addr_range;
@@ -352,6 +380,31 @@ struct damos_filter {
struct list_head list;
};
+struct damon_ctx;
+struct damos;
+
+/**
+ * struct damos_walk_control - Control damos_walk().
+ *
+ * @walk_fn: Function to be called back for each region.
+ * @data: Data that will be passed to walk functions.
+ *
+ * Control damos_walk(), which requests specific kdamond to invoke the given
+ * function to each region that eligible to apply actions of the kdamond's
+ * schemes. Refer to damos_walk() for more details.
+ */
+struct damos_walk_control {
+ void (*walk_fn)(void *data, struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *s, unsigned long sz_filter_passed);
+ void *data;
+/* private: internal use only */
+ /* informs if the kdamond finished handling of the walk request */
+ struct completion completion;
+ /* informs if the walk is canceled. */
+ bool canceled;
+};
+
/**
* struct damos_access_pattern - Target access pattern of the given scheme.
* @min_sz_region: Minimum size of target regions.
@@ -415,6 +468,8 @@ struct damos {
* @action
*/
unsigned long next_apply_sis;
+ /* informs if ongoing DAMOS walk for this scheme is finished */
+ bool walk_completed;
/* public: */
struct damos_quota quota;
struct damos_watermarks wmarks;
@@ -442,8 +497,6 @@ enum damon_ops_id {
NR_DAMON_OPS,
};
-struct damon_ctx;
-
/**
* struct damon_operations - Monitoring operations for given use cases.
*
@@ -487,7 +540,8 @@ struct damon_ctx;
* @apply_scheme is called from @kdamond when a region for user provided
* DAMON-based operation scheme is found. It should apply the scheme's action
* to the region and return bytes of the region that the action is successfully
- * applied.
+ * applied. It should also report how many bytes of the region has passed
+ * filters (&struct damos_filter) that handled by itself.
* @target_valid should check whether the target is still valid for the
* monitoring.
* @cleanup is called from @kdamond just before its termination.
@@ -504,7 +558,7 @@ struct damon_operations {
struct damos *scheme);
unsigned long (*apply_scheme)(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r,
- struct damos *scheme);
+ struct damos *scheme, unsigned long *sz_filter_passed);
bool (*target_valid)(struct damon_target *t);
void (*cleanup)(struct damon_ctx *context);
};
@@ -552,6 +606,27 @@ struct damon_callback {
void (*before_terminate)(struct damon_ctx *context);
};
+/*
+ * struct damon_call_control - Control damon_call().
+ *
+ * @fn: Function to be called back.
+ * @data: Data that will be passed to @fn.
+ * @return_code: Return code from @fn invocation.
+ *
+ * Control damon_call(), which requests specific kdamond to invoke a given
+ * function. Refer to damon_call() for more details.
+ */
+struct damon_call_control {
+ int (*fn)(void *data);
+ void *data;
+ int return_code;
+/* private: internal use only */
+ /* informs if the kdamond finished handling of the request */
+ struct completion completion;
+ /* informs if the kdamond canceled @fn invocation */
+ bool canceled;
+};
+
/**
* struct damon_attrs - Monitoring attributes for accuracy/overhead control.
*
@@ -632,6 +707,12 @@ struct damon_ctx {
/* for scheme quotas prioritization */
unsigned long *regions_score_histogram;
+ struct damon_call_control *call_control;
+ struct mutex call_control_lock;
+
+ struct damos_walk_control *walk_control;
+ struct mutex walk_control_lock;
+
/* public: */
struct task_struct *kdamond;
struct mutex kdamond_lock;
@@ -725,7 +806,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed,
struct damon_attrs *attrs);
struct damos_filter *damos_new_filter(enum damos_filter_type type,
- bool matching);
+ bool matching, bool allow);
void damos_add_filter(struct damos *s, struct damos_filter *f);
void damos_destroy_filter(struct damos_filter *f);
@@ -779,6 +860,9 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs
int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
+int damon_call(struct damon_ctx *ctx, struct damon_call_control *control);
+int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control);
+
int damon_set_region_biggest_system_ram_default(struct damon_target *t,
unsigned long *start, unsigned long *end);
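
A minimal sketch of the new damon_call() interface added above: ask a running kdamond to execute a callback in its own context. The callback and the args struct are hypothetical; damon_for_each_scheme() is assumed to be the existing iterator from this header.

struct nr_schemes_args {
	struct damon_ctx *ctx;
	int nr;
};

static int count_schemes_fn(void *data)
{
	struct nr_schemes_args *args = data;
	struct damos *s;

	/* runs from the kdamond itself, so no extra locking here */
	damon_for_each_scheme(s, args->ctx)
		args->nr++;
	return 0;
}

static int read_nr_schemes(struct damon_ctx *ctx, int *nr)
{
	struct nr_schemes_args args = { .ctx = ctx, .nr = 0 };
	struct damon_call_control control = {
		.fn = count_schemes_fn,
		.data = &args,
	};
	int err;

	err = damon_call(ctx, &control);
	if (err)
		return err;
	*nr = args.nr;
	return control.return_code;
}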
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 534e652bd05f..be3ad155ec9f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2926,6 +2926,8 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
loff_t start, loff_t end);
+int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+ loff_t end);
static inline int file_write_and_wait(struct file *file)
{
@@ -2958,6 +2960,11 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
(iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
if (ret)
return ret;
+ } else if (iocb->ki_flags & IOCB_DONTCACHE) {
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+ filemap_fdatawrite_range_kick(mapping, iocb->ki_pos,
+ iocb->ki_pos + count);
}
return count;
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b0fe9f62d15b..6bb1a5a7a4ae 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -212,35 +212,31 @@ struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
- struct list_head *page_list,
struct page **page_array);
#define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__))
-unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
+unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
unsigned long nr_pages,
struct page **page_array);
-#define alloc_pages_bulk_array_mempolicy(...) \
- alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__))
+#define alloc_pages_bulk_mempolicy(...) \
+ alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__))
/* Bulk allocate order-0 pages */
-#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \
- __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL)
-
-#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \
- __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array)
+#define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \
+ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array)
static inline unsigned long
-alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages,
+alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages,
struct page **page_array)
{
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
- return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array);
+ return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array);
}
-#define alloc_pages_bulk_array_node(...) \
- alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__))
+#define alloc_pages_bulk_node(...) \
+ alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__))
static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
{
@@ -300,8 +296,6 @@ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
#ifdef CONFIG_NUMA
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order);
-struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
- struct mempolicy *mpol, pgoff_t ilx, int nid);
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
struct mempolicy *mpol, pgoff_t ilx, int nid);
@@ -312,11 +306,6 @@ static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order
{
return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order);
}
-static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
- struct mempolicy *mpol, pgoff_t ilx, int nid)
-{
- return alloc_pages_noprof(gfp, order);
-}
static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
return __folio_alloc_node_noprof(gfp, order, numa_node_id());
@@ -331,7 +320,6 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde
#endif
#define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__))
-#define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__))
#define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__))
#define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__))
#define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__))
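
After the rename, the bulk allocator is array-only. A short sketch of the common pattern with the new alloc_pages_bulk() name; error handling is simplified and the array is assumed to start out NULL-filled:

static int fill_page_array(struct page **pages, unsigned long nr)
{
	unsigned long got = alloc_pages_bulk(GFP_KERNEL, nr, pages);

	if (got < nr) {
		/* partial allocation: release what we got and bail out */
		while (got)
			__free_page(pages[--got]);
		return -ENOMEM;
	}
	return 0;
}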
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b94c2e8ee918..93e509b6c00e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -121,6 +121,8 @@ enum mthp_stat_item {
MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
MTHP_STAT_ZSWPOUT,
MTHP_STAT_SWPIN,
+ MTHP_STAT_SWPIN_FALLBACK,
+ MTHP_STAT_SWPIN_FALLBACK_CHARGE,
MTHP_STAT_SWPOUT,
MTHP_STAT_SWPOUT_FALLBACK,
MTHP_STAT_SHMEM_ALLOC,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ae4fe8615bb6..ec8c0ccc8f95 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -153,11 +153,11 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
-bool isolate_hugetlb(struct folio *folio, struct list_head *list);
+bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared);
-void folio_putback_active_hugetlb(struct folio *folio);
+void folio_putback_hugetlb(struct folio *folio);
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
@@ -414,7 +414,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
return NULL;
}
-static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list)
+static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
{
return false;
}
@@ -430,7 +430,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return 0;
}
-static inline void folio_putback_active_hugetlb(struct folio *folio)
+static inline void folio_putback_hugetlb(struct folio *folio)
{
}
@@ -681,8 +681,9 @@ struct huge_bootmem_page {
};
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve);
+ unsigned long addr, bool cow_from_owner);
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask,
bool allow_alloc_fallback);
@@ -1059,9 +1060,15 @@ static inline int isolate_or_dissolve_huge_page(struct page *page,
return -ENOMEM;
}
+static inline int replace_free_hugepage_folios(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ return 0;
+}
+
static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr,
- int avoid_reserve)
+ bool cow_from_owner)
{
return NULL;
}
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 56465af31044..890011071f2b 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -491,7 +491,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
void kasan_cache_shrink(struct kmem_cache *cache);
void kasan_cache_shutdown(struct kmem_cache *cache);
void kasan_record_aux_stack(void *ptr);
-void kasan_record_aux_stack_noalloc(void *ptr);
#else /* CONFIG_KASAN_GENERIC */
@@ -509,7 +508,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache,
static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
static inline void kasan_record_aux_stack(void *ptr) {}
-static inline void kasan_record_aux_stack_noalloc(void *ptr) {}
#endif /* CONFIG_KASAN_GENERIC */
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 6a53ac4885bb..d73095b5cd96 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -93,6 +93,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
void collect_procs_ksm(const struct folio *folio, const struct page *page,
struct list_head *to_kill, int force_early);
long ksm_process_profit(struct mm_struct *);
+bool ksm_process_mergeable(struct mm_struct *mm);
#else /* !CONFIG_KSM */
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 05c166811f6b..fe739d35a864 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -91,13 +91,24 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
* @memcg: the cgroup of the sublist to add the item to.
*
* If the element is already part of a list, this function returns doing
- * nothing. Therefore the caller does not need to keep state about whether or
- * not the element already belongs in the list and is allowed to lazy update
- * it. Note however that this is valid for *a* list, not *this* list. If
- * the caller organize itself in a way that elements can be in more than
- * one type of list, it is up to the caller to fully remove the item from
- * the previous list (with list_lru_del() for instance) before moving it
- * to @lru.
+ * nothing. This means that it is not necessary to keep state about whether or
+ * not the element already belongs in the list. That said, this logic only
+ * works if the item is in *this* list. If the item might be in some other
+ * list, then you cannot rely on this check and you must remove it from the
+ * other list before trying to insert it.
+ *
+ * The lru list consists of many sublists internally; the @nid and @memcg
+ * parameters are used to determine which sublist to insert the item into.
+ * It's important to use the right value of @nid and @memcg when deleting the
+ * item, since it might otherwise get deleted from the wrong sublist.
+ *
+ * This also applies when attempting to insert the item multiple times - if
+ * the item is currently in one sublist and you call list_lru_add() again, you
+ * must pass the right @nid and @memcg parameters so that the same sublist is
+ * used.
+ *
+ * You must ensure that the memcg is not freed during this call (e.g., with
+ * rcu or by taking a css refcnt).
*
* Return: true if the list was updated, false otherwise
*/
@@ -113,7 +124,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
* memcg of the sublist is determined by @item list_head. This assumption is
* valid for slab objects LRU such as dentries, inodes, etc.
*
- * Return value: true if the list was updated, false otherwise
+ * Return: true if the list was updated, false otherwise
*/
bool list_lru_add_obj(struct list_lru *lru, struct list_head *item);
@@ -125,8 +136,19 @@ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item);
* @memcg: the cgroup of the sublist to delete the item from.
*
* This function works analogously as list_lru_add() in terms of list
- * manipulation. The comments about an element already pertaining to
- * a list are also valid for list_lru_del().
+ * manipulation.
+ *
+ * The comments in list_lru_add() about an element already being in a list are
+ * also valid for list_lru_del(), that is, you can delete an item that has
+ * already been removed or never been added. However, if the item is in a
+ * list, it must be in *this* list, and you must pass the right value of @nid
+ * and @memcg so that the right sublist is used.
+ *
+ * You must ensure that the memcg is not freed during this call (e.g., with
+ * rcu or by taking a css refcnt). When a memcg is deleted, list_lru entries
+ * are automatically moved to the parent memcg. This is done in a race-free
+ * way, so during deletion of an memcg both the old and new memcg will resolve
+ * to the same sublist internally.
*
* Return: true if the list was updated, false otherwise
*/
@@ -142,7 +164,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
* memcg of the sublist is determined by @item list_head. This assumption is
* valid for slab objects LRU such as dentries, inodes, etc.
*
- * Return value: true if the list was updated, false otherwise.
+ * Return: true if the list was updated, false otherwise.
*/
bool list_lru_del_obj(struct list_lru *lru, struct list_head *item);
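
A sketch of the usage rules spelled out in the updated comments: the same nid/memcg pair must be used for the add and the matching delete, and the memcg must be kept alive across the calls. struct my_cache_obj and its fields are hypothetical.

struct my_cache_obj {
	struct list_head lru;
	int nid;			/* node the object was allocated on */
	struct mem_cgroup *memcg;	/* charged memcg, kept alive by the caller */
};

static void my_obj_make_reclaimable(struct list_lru *lru, struct my_cache_obj *obj)
{
	/* no-op if already on this lru; obj must not sit on any other list */
	list_lru_add(lru, &obj->lru, obj->nid, obj->memcg);
}

static void my_obj_pin(struct list_lru *lru, struct my_cache_obj *obj)
{
	/* must pass the same nid/memcg that were used at add time */
	list_lru_del(lru, &obj->lru, obj->nid, obj->memcg);
}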
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 673d5cae7c81..e79eb6ac516f 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -378,6 +378,10 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
/* Flags for memblock allocation APIs */
#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
#define MEMBLOCK_ALLOC_ACCESSIBLE 0
+/*
+ * MEMBLOCK_ALLOC_NOLEAKTRACE avoids kmemleak tracing. It implies
+ * MEMBLOCK_ALLOC_ACCESSIBLE
+ */
#define MEMBLOCK_ALLOC_NOLEAKTRACE 1
/* We are using top down, so it is safe to use 0 here */
@@ -417,6 +421,12 @@ static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
}
+void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
+ const char *func);
+
+#define memblock_alloc_or_panic(size, align) \
+ __memblock_alloc_or_panic(size, align, __func__)
+
static inline void *memblock_alloc_raw(phys_addr_t size,
phys_addr_t align)
{
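
A minimal sketch of the new memblock_alloc_or_panic() helper for early allocations that must not fail; the table name is illustrative:

static void *boot_table __initdata;

static void __init setup_boot_table(void)
{
	/* panics (reporting __func__) on failure, so no NULL check is needed */
	boot_table = memblock_alloc_or_panic(PAGE_SIZE, SMP_CACHE_BYTES);
}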
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5502aa8e138e..6e74b8254d9b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -620,8 +620,6 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
page_counter_read(&memcg->memory);
}
-void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
-
int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
/**
@@ -646,8 +644,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
return __mem_cgroup_charge(folio, mm, gfp);
}
-int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
- long nr_pages);
+int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp);
int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry);
@@ -677,7 +674,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
__mem_cgroup_uncharge_folios(folios);
}
-void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
void mem_cgroup_migrate(struct folio *old, struct folio *new);
@@ -1046,6 +1042,23 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
void split_page_memcg(struct page *head, int old_order, int new_order);
+static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+ u64 id;
+
+ if (mem_cgroup_disabled())
+ return 0;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ id = cgroup_id(memcg->css.cgroup);
+ rcu_read_unlock();
+ return id;
+}
+
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
@@ -1135,21 +1148,15 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
return false;
}
-static inline void mem_cgroup_commit_charge(struct folio *folio,
- struct mem_cgroup *memcg)
-{
-}
-
static inline int mem_cgroup_charge(struct folio *folio,
struct mm_struct *mm, gfp_t gfp)
{
return 0;
}
-static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
- gfp_t gfp, long nr_pages)
+static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp)
{
- return 0;
+ return 0;
}
static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
@@ -1170,11 +1177,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
}
-static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
- unsigned int nr_pages)
-{
-}
-
static inline void mem_cgroup_replace_folio(struct folio *old,
struct folio *new)
{
@@ -1466,6 +1468,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
static inline void split_page_memcg(struct page *head, int old_order, int new_order)
{
}
+
+static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
+{
+ return 0;
+}
#endif /* CONFIG_MEMCG */
/*
diff --git a/include/linux/memfd.h b/include/linux/memfd.h
index d437e3070850..246daadbfde8 100644
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -7,7 +7,14 @@
#ifdef CONFIG_MEMFD_CREATE
extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg);
struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx);
-unsigned int *memfd_file_seals_ptr(struct file *file);
+/*
+ * Check for any existing seals on mmap, return an error if access is denied due
+ * to sealing, or 0 otherwise.
+ *
+ * We also update VMA flags if appropriate by manipulating the VMA flags pointed
+ * to by vm_flags_ptr.
+ */
+int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr);
#else
static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a)
{
@@ -17,19 +24,11 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
{
return ERR_PTR(-EINVAL);
}
-
-static inline unsigned int *memfd_file_seals_ptr(struct file *file)
+static inline int memfd_check_seals_mmap(struct file *file,
+ unsigned long *vm_flags_ptr)
{
- return NULL;
+ return 0;
}
#endif
-/* Retrieve memfd seals associated with the file, if any. */
-static inline unsigned int memfd_file_seals(struct file *file)
-{
- unsigned int *sealsp = memfd_file_seals_ptr(file);
-
- return sealsp ? *sealsp : 0;
-}
-
#endif /* __LINUX_MEMFD_H */
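
Sketch of how an mmap path might use the new memfd_check_seals_mmap() in place of the removed memfd_file_seals()/seal-check pattern; the wrapper name is hypothetical and the caller is assumed to own the vm_flags it passes in:

static int my_mmap_prepare(struct file *file, unsigned long *vm_flags)
{
	int err = memfd_check_seals_mmap(file, vm_flags);

	if (err)
		return err;	/* e.g. write-sealed MAP_SHARED|PROT_WRITE mapping */

	/* *vm_flags may have been adjusted (e.g. for read-only sealed memfds) */
	return 0;
}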
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index b27ddce5d324..eaac5ae8c05c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -144,8 +144,6 @@ extern u64 max_mem_size;
extern int mhp_online_type_from_str(const char *str);
-/* Default online_type (MMOP_*) when new memory blocks are added. */
-extern int mhp_default_online_type;
/* If movable_node boot option specified */
extern bool movable_node_enabled;
static inline bool movable_node_is_enabled(void)
@@ -303,6 +301,9 @@ static inline void __remove_memory(u64 start, u64 size) {}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_MEMORY_HOTPLUG
+/* Default online_type (MMOP_*) when new memory blocks are added. */
+extern int mhp_get_default_online_type(void);
+extern void mhp_set_default_online_type(int online_type);
extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat);
extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
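
The default online type is now reached through accessors rather than the exported variable. A small sketch, assuming the existing MMOP_OFFLINE/MMOP_ONLINE values:

static void enable_auto_onlining(void)
{
	if (mhp_get_default_online_type() == MMOP_OFFLINE)
		mhp_set_default_online_type(MMOP_ONLINE);
}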
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 002e49b2ebd9..29919faea2f1 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -144,16 +144,14 @@ const struct movable_operations *page_movable_ops(struct page *page)
#ifdef CONFIG_NUMA_BALANCING
int migrate_misplaced_folio_prepare(struct folio *folio,
struct vm_area_struct *vma, int node);
-int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
- int node);
+int migrate_misplaced_folio(struct folio *folio, int node);
#else
static inline int migrate_misplaced_folio_prepare(struct folio *folio,
struct vm_area_struct *vma, int node)
{
return -EAGAIN; /* can't migrate now */
}
-static inline int migrate_misplaced_folio(struct folio *folio,
- struct vm_area_struct *vma, int node)
+static inline int migrate_misplaced_folio(struct folio *folio, int node)
{
return -EAGAIN; /* can't migrate now */
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21428d897d76..7b1068ddcbb7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2320,6 +2320,7 @@ extern void pagefault_out_of_memory(void);
struct zap_details {
struct folio *single_folio; /* Locked folio to be unmapped */
bool even_cows; /* Zap COWed private pages too? */
+ bool reclaim_pt; /* Need reclaim page tables? */
zap_flags_t zap_flags; /* Extra flags for zapping */
};
@@ -2991,18 +2992,15 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
-static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
+static inline void __pagetable_ctor(struct ptdesc *ptdesc)
{
struct folio *folio = ptdesc_folio(ptdesc);
- if (!ptlock_init(ptdesc))
- return false;
__folio_set_pgtable(folio);
lruvec_stat_add_folio(folio, NR_PAGETABLE);
- return true;
}
-static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
+static inline void pagetable_dtor(struct ptdesc *ptdesc)
{
struct folio *folio = ptdesc_folio(ptdesc);
@@ -3011,6 +3009,20 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}
+static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
+{
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
+
+static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
+{
+ if (!ptlock_init(ptdesc))
+ return false;
+ __pagetable_ctor(ptdesc);
+ return true;
+}
+
pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr,
pmd_t *pmdvalp)
@@ -3087,14 +3099,6 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
return ptlock_init(ptdesc);
}
-static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
-#endif
- ptlock_free(ptdesc);
-}
-
#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
#else
@@ -3105,7 +3109,6 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
}
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
-static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {}
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
@@ -3120,25 +3123,13 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
-
if (!pmd_ptlock_init(ptdesc))
return false;
- __folio_set_pgtable(folio);
ptdesc_pmd_pts_init(ptdesc);
- lruvec_stat_add_folio(folio, NR_PAGETABLE);
+ __pagetable_ctor(ptdesc);
return true;
}
-static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
-{
- struct folio *folio = ptdesc_folio(ptdesc);
-
- pmd_ptlock_free(ptdesc);
- __folio_clear_pgtable(folio);
- lruvec_stat_sub_folio(folio, NR_PAGETABLE);
-}
-
/*
* No scalability reason to split PUD locks yet, but follow the same pattern
* as the PMD locks to make it easier if we decide to. The VM should not be
@@ -3160,18 +3151,17 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
-
- __folio_set_pgtable(folio);
- lruvec_stat_add_folio(folio, NR_PAGETABLE);
+ __pagetable_ctor(ptdesc);
}
-static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
+static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
+ __pagetable_ctor(ptdesc);
+}
- __folio_clear_pgtable(folio);
- lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc)
+{
+ __pagetable_ctor(ptdesc);
}
extern void __init pagecache_init(void);
@@ -3324,6 +3314,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, bool write);
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
@@ -3371,9 +3363,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
return __get_unmapped_area(file, addr, len, pgoff, flags, 0);
}
-extern unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
@@ -3438,9 +3427,6 @@ extern unsigned long stack_guard_gap;
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
-/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
-int expand_downwards(struct vm_area_struct *vma, unsigned long address);
-
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
@@ -4097,67 +4083,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
extern int sysctl_nr_trim_pages;
-#ifdef CONFIG_PRINTK
-void mem_dump_obj(void *object);
-#else
-static inline void mem_dump_obj(void *object) {}
-#endif
-
-static inline bool is_write_sealed(int seals)
-{
- return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
-}
-
-/**
- * is_readonly_sealed - Checks whether write-sealed but mapped read-only,
- * in which case writes should be disallowing moving
- * forwards.
- * @seals: the seals to check
- * @vm_flags: the VMA flags to check
- *
- * Returns whether readonly sealed, in which case writess should be disallowed
- * going forward.
- */
-static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags)
-{
- /*
- * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
- * MAP_SHARED and read-only, take care to not allow mprotect to
- * revert protections on such mappings. Do this only for shared
- * mappings. For private mappings, don't need to mask
- * VM_MAYWRITE as we still want them to be COW-writable.
- */
- if (is_write_sealed(seals) &&
- ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED))
- return true;
-
- return false;
-}
-
-/**
- * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
- * handle them.
- * @seals: the seals to check
- * @vma: the vma to operate on
- *
- * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
- * check/handling on the vma flags. Return 0 if check pass, or <0 for errors.
- */
-static inline int seal_check_write(int seals, struct vm_area_struct *vma)
-{
- if (!is_write_sealed(seals))
- return 0;
-
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * write seals are active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
-
- return 0;
-}
-
#ifdef CONFIG_ANON_VMA_NAME
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
unsigned long len_in,
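
With the refactor above, the per-level constructors share __pagetable_ctor() and teardown funnels through pagetable_dtor()/pagetable_dtor_free(). A hedged sketch of the resulting alloc/free pairing, loosely following a generic PTE page allocation path; GFP_PGTABLE_USER and pagetable_alloc()/pagetable_free() are assumed to be the existing helpers:

static struct ptdesc *my_pte_page_alloc(struct mm_struct *mm)
{
	struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_USER, 0);

	if (!ptdesc)
		return NULL;
	if (!pagetable_pte_ctor(ptdesc)) {	/* ptlock init + __pagetable_ctor() */
		pagetable_free(ptdesc);
		return NULL;
	}
	return ptdesc;
}

static void my_pte_page_free(struct ptdesc *ptdesc)
{
	/* new helper: pagetable_dtor() followed by pagetable_free() */
	pagetable_dtor_free(ptdesc);
}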
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 1b6a917fffa4..f9157a0c42a5 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -133,31 +133,25 @@ static inline int lru_hist_from_seq(unsigned long seq)
return seq % NR_HIST_GENS;
}
-static inline int lru_tier_from_refs(int refs)
+static inline int lru_tier_from_refs(int refs, bool workingset)
{
VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
- /* see the comment in folio_lru_refs() */
- return order_base_2(refs + 1);
+ /* see the comment on MAX_NR_TIERS */
+ return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs);
}
static inline int folio_lru_refs(struct folio *folio)
{
unsigned long flags = READ_ONCE(folio->flags);
- bool workingset = flags & BIT(PG_workingset);
+ if (!(flags & BIT(PG_referenced)))
+ return 0;
/*
- * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
- * total number of accesses is N>1, since N=0,1 both map to the first
- * tier. lru_tier_from_refs() will account for this off-by-one. Also see
- * the comment on MAX_NR_TIERS.
+ * Return the total number of accesses including PG_referenced. Also see
+ * the comment on LRU_REFS_FLAGS.
*/
- return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
-}
-
-static inline void folio_clear_lru_refs(struct folio *folio)
-{
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1;
}
static inline int folio_lru_gen(struct folio *folio)
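
A short worked example of the refs-to-tier mapping after this change, assuming MAX_NR_TIERS == 4 (as set in the mmzone.h changes later in this diff):

/*
 *   folio_lru_refs() == 0 (no PG_referenced)  -> tier 0
 *   refs == 1                                 -> order_base_2(1) = 0
 *   refs == 2                                 -> order_base_2(2) = 1
 *   refs == 4                                 -> order_base_2(4) = 2
 *   PG_workingset set                         -> MAX_NR_TIERS - 1 = 3
 */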
@@ -223,11 +217,43 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}
+static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio,
+ bool reclaiming)
+{
+ int gen;
+ int type = folio_is_file_lru(folio);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ /*
+ * +-----------------------------------+-----------------------------------+
+ * | Accessed through page tables and | Accessed through file descriptors |
+ * | promoted by folio_update_gen() | and protected by folio_inc_gen() |
+ * +-----------------------------------+-----------------------------------+
+ * | PG_active (set while isolated) | |
+ * +-----------------+-----------------+-----------------+-----------------+
+ * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS |
+ * +-----------------------------------+-----------------------------------+
+ * |<---------- MIN_NR_GENS ---------->| |
+ * |<---------------------------- MAX_NR_GENS ---------------------------->|
+ */
+ if (folio_test_active(folio))
+ gen = MIN_NR_GENS - folio_test_workingset(folio);
+ else if (reclaiming)
+ gen = MAX_NR_GENS;
+ else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) ||
+ (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))))
+ gen = MIN_NR_GENS;
+ else
+ gen = MAX_NR_GENS - folio_test_workingset(folio);
+
+ return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type]));
+}
+
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
unsigned long seq;
unsigned long flags;
- unsigned long mask;
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
@@ -237,40 +263,12 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
if (folio_test_unevictable(folio) || !lrugen->enabled)
return false;
- /*
- * There are four common cases for this page:
- * 1. If it's hot, i.e., freshly faulted in, add it to the youngest
- * generation, and it's protected over the rest below.
- * 2. If it can't be evicted immediately, i.e., a dirty page pending
- * writeback, add it to the second youngest generation.
- * 3. If it should be evicted first, e.g., cold and clean from
- * folio_rotate_reclaimable(), add it to the oldest generation.
- * 4. Everything else falls between 2 & 3 above and is added to the
- * second oldest generation if it's considered inactive, or the
- * oldest generation otherwise. See lru_gen_is_active().
- */
- if (folio_test_active(folio))
- seq = lrugen->max_seq;
- else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
- (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio))))
- seq = lrugen->max_seq - 1;
- else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
- seq = lrugen->min_seq[type];
- else
- seq = lrugen->min_seq[type] + 1;
+ seq = lru_gen_folio_seq(lruvec, folio, reclaiming);
gen = lru_gen_from_seq(seq);
flags = (gen + 1UL) << LRU_GEN_PGOFF;
/* see the comment on MIN_NR_GENS about PG_active */
- mask = LRU_GEN_MASK;
- /*
- * Don't clear PG_workingset here because it can affect PSI accounting
- * if the activation is due to workingset refault.
- */
- if (folio_test_active(folio))
- mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active);
- set_mask_bits(&folio->flags, mask, flags);
+ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
lru_gen_update_size(lruvec, folio, -1, gen);
/* for folio_rotate_reclaimable() */
@@ -564,9 +562,9 @@ static inline pte_marker copy_pte_marker(
* Must be called with pgtable lock held so that no thread will see the none
* pte, and if they see it, they'll fault and serialize at the pgtable lock.
*
- * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled.
+ * Returns true if an uffd-wp pte was installed, false otherwise.
*/
-static inline void
+static inline bool
pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
pte_t *pte, pte_t pteval)
{
@@ -583,7 +581,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
* with a swap pte. There's no way of leaking the bit.
*/
if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
- return;
+ return false;
/* A uffd-wp wr-protected normal pte */
if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval)))
@@ -596,10 +594,13 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
if (unlikely(pte_swp_uffd_wp_any(pteval)))
arm_uffd_pte = true;
- if (unlikely(arm_uffd_pte))
+ if (unlikely(arm_uffd_pte)) {
set_pte_at(vma->vm_mm, addr, pte,
make_pte_marker(PTE_MARKER_UFFD_WP));
+ return true;
+ }
#endif
+ return false;
}
static inline bool vma_has_recency(struct vm_area_struct *vma)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 825c04b56403..5f1b2dc788e2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -438,7 +438,9 @@ FOLIO_MATCH(compound_head, _head_2a);
* struct ptdesc - Memory descriptor for page tables.
* @__page_flags: Same as page flags. Powerpc only.
* @pt_rcu_head: For freeing page table pages.
- * @pt_list: List of used page tables. Used for s390 and x86.
+ * @pt_list: List of used page tables. Used for s390 gmap shadow pages
+ * (which are not linked into the user page tables) and x86
+ * pgds.
* @_pt_pad_1: Padding that aliases with page's compound head.
* @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
* @__page_mapping: Aliases with page->mapping. Unused for page tables.
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index d7cb1e5ecbda..a0a3894900ed 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -9,10 +9,12 @@ struct page;
struct vm_area_struct;
struct mm_struct;
struct vma_iterator;
+struct vma_merge_struct;
void dump_page(const struct page *page, const char *reason);
void dump_vma(const struct vm_area_struct *vma);
void dump_mm(const struct mm_struct *mm);
+void dump_vmg(const struct vma_merge_struct *vmg, const char *reason);
void vma_iter_dump_tree(const struct vma_iterator *vmi);
#ifdef CONFIG_DEBUG_VM
@@ -87,6 +89,15 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi);
} \
unlikely(__ret_warn_once); \
})
+#define VM_WARN_ON_VMG(cond, vmg) ({ \
+ int __ret_warn = !!(cond); \
+ \
+ if (unlikely(__ret_warn)) { \
+ dump_vmg(vmg, "VM_WARN_ON_VMG(" __stringify(cond)")"); \
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn); \
+})
#define VM_WARN_ON(cond) (void)WARN_ON(cond)
#define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
@@ -104,9 +115,10 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi);
#define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ON_ONCE_MM(cond, mm) BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_VMG(cond, vmg) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
-#endif
+#endif /* CONFIG_DEBUG_VM */
#ifdef CONFIG_DEBUG_VM_IRQSOFF
#define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled())
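
A sketch of the new VMG variant, which dumps the whole vma_merge_struct before warning; vmg_sane() is a hypothetical consistency check stubbed out here:

/* hypothetical consistency check, stubbed out for the sketch */
static bool vmg_sane(const struct vma_merge_struct *vmg)
{
	return vmg != NULL;
}

static void assert_vmg_sane(const struct vma_merge_struct *vmg)
{
	VM_WARN_ON_VMG(!vmg_sane(vmg), vmg);
}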
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b36124145a16..9540b41894da 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -332,66 +332,88 @@ enum lruvec_flags {
#endif /* !__GENERATING_BOUNDS_H */
/*
- * Evictable pages are divided into multiple generations. The youngest and the
+ * Evictable folios are divided into multiple generations. The youngest and the
* oldest generation numbers, max_seq and min_seq, are monotonically increasing.
* They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
* offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
* corresponding generation. The gen counter in folio->flags stores gen+1 while
- * a page is on one of lrugen->folios[]. Otherwise it stores 0.
+ * a folio is on one of lrugen->folios[]. Otherwise it stores 0.
*
- * A page is added to the youngest generation on faulting. The aging needs to
- * check the accessed bit at least twice before handing this page over to the
- * eviction. The first check takes care of the accessed bit set on the initial
- * fault; the second check makes sure this page hasn't been used since then.
- * This process, AKA second chance, requires a minimum of two generations,
- * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
- * LRU, e.g., /proc/vmstat, these two generations are considered active; the
- * rest of generations, if they exist, are considered inactive. See
- * lru_gen_is_active().
+ * After a folio is faulted in, the aging needs to check the accessed bit at
+ * least twice before handing this folio over to the eviction. The first check
+ * clears the accessed bit from the initial fault; the second check makes sure
+ * this folio hasn't been used since then. This process, AKA second chance,
+ * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI
+ * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two
+ * generations are considered active; the rest of generations, if they exist,
+ * are considered inactive. See lru_gen_is_active().
*
- * PG_active is always cleared while a page is on one of lrugen->folios[] so
- * that the aging needs not to worry about it. And it's set again when a page
- * considered active is isolated for non-reclaiming purposes, e.g., migration.
- * See lru_gen_add_folio() and lru_gen_del_folio().
+ * PG_active is always cleared while a folio is on one of lrugen->folios[] so
+ * that the sliding window needs not to worry about it. And it's set again when
+ * a folio considered active is isolated for non-reclaiming purposes, e.g.,
+ * migration. See lru_gen_add_folio() and lru_gen_del_folio().
*
* MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
* number of categories of the active/inactive LRU when keeping track of
* accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
- * in folio->flags.
+ * in folio->flags, masked by LRU_GEN_MASK.
*/
#define MIN_NR_GENS 2U
#define MAX_NR_GENS 4U
/*
- * Each generation is divided into multiple tiers. A page accessed N times
- * through file descriptors is in tier order_base_2(N). A page in the first tier
- * (N=0,1) is marked by PG_referenced unless it was faulted in through page
- * tables or read ahead. A page in any other tier (N>1) is marked by
- * PG_referenced and PG_workingset. This implies a minimum of two tiers is
- * supported without using additional bits in folio->flags.
+ * Each generation is divided into multiple tiers. A folio accessed N times
+ * through file descriptors is in tier order_base_2(N). A folio in the first
+ * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page
+ * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by
+ * PG_workingset. A folio in any other tier (1<N<5) between the first and last
+ * is marked by additional bits of LRU_REFS_WIDTH in folio->flags.
*
* In contrast to moving across generations which requires the LRU lock, moving
* across tiers only involves atomic operations on folio->flags and therefore
* has a negligible cost in the buffered access path. In the eviction path,
- * comparisons of refaulted/(evicted+protected) from the first tier and the
- * rest infer whether pages accessed multiple times through file descriptors
- * are statistically hot and thus worth protecting.
+ * comparisons of refaulted/(evicted+protected) from the first tier and the rest
+ * infer whether folios accessed multiple times through file descriptors are
+ * statistically hot and thus worth protecting.
*
* MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
* number of categories of the active/inactive LRU when keeping track of
* accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
- * folio->flags.
+ * folio->flags, masked by LRU_REFS_MASK.
*/
#define MAX_NR_TIERS 4U
#ifndef __GENERATING_BOUNDS_H
-struct lruvec;
-struct page_vma_mapped_walk;
-
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+/*
+ * For folios accessed multiple times through file descriptors,
+ * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags
+ * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its
+ * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily
+ * promoted into the second oldest generation in the eviction path. And when
+ * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that
+ * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is
+ * only valid when PG_referenced is set.
+ *
+ * For folios accessed multiple times through page tables, folio_update_gen()
+ * from a page table walk or lru_gen_set_refs() from a rmap walk sets
+ * PG_referenced after the accessed bit is cleared for the first time.
+ * Thereafter, those two paths set PG_workingset and promote folios to the
+ * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears
+ * PG_referenced. Note that for this case, LRU_REFS_MASK is not used.
+ *
+ * For both cases above, after PG_workingset is set on a folio, it remains until
+ * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It
+ * can be set again if lru_gen_test_recent() returns true upon a refault.
+ */
+#define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced))
+
+struct lruvec;
+struct page_vma_mapped_walk;
+
#ifdef CONFIG_LRU_GEN
enum {
@@ -406,8 +428,6 @@ enum {
NR_LRU_GEN_CAPS
};
-#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
-
#define MIN_LRU_BATCH BITS_PER_LONG
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
@@ -421,12 +441,11 @@ enum {
/*
* The youngest generation number is stored in max_seq for both anon and file
* types as they are aged on an equal footing. The oldest generation numbers are
- * stored in min_seq[] separately for anon and file types as clean file pages
- * can be evicted regardless of swap constraints.
- *
- * Normally anon and file min_seq are in sync. But if swapping is constrained,
- * e.g., out of swap space, file min_seq is allowed to advance and leave anon
- * min_seq behind.
+ * stored in min_seq[] separately for anon and file types so that they can be
+ * incremented independently. Ideally min_seq[] are kept in sync when both anon
+ * and file types are evictable. However, to adapt to situations like extreme
+ * swappiness, they are allowed to be out of sync by at most
+ * MAX_NR_GENS-MIN_NR_GENS-1.
*
* The number of pages in each generation is eventually consistent and therefore
* can be transiently negative when reset_batch_size() is pending.
@@ -446,8 +465,8 @@ struct lru_gen_folio {
unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
/* the exponential moving average of evicted+protected */
unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
- /* the first tier doesn't need protection, hence the minus one */
- unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
+ /* can only be modified under the LRU lock */
+ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* can be modified without holding the LRU lock */
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
@@ -498,7 +517,7 @@ struct lru_gen_mm_walk {
int mm_stats[NR_MM_STATS];
/* total batched items */
int batched;
- bool can_swap;
+ int swappiness;
bool force_scan;
};
diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h
index cfad6ce7e1bd..dd85613cdd86 100644
--- a/include/linux/numa_memblks.h
+++ b/include/linux/numa_memblks.h
@@ -29,7 +29,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
int __init numa_memblks_init(int (*init_func)(void),
bool memblock_force_top_down);
+extern int numa_distance_cnt;
+
#ifdef CONFIG_NUMA_EMU
+extern int emu_nid_to_phys[MAX_NUMNODES];
int numa_emu_cmdline(char *str);
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
unsigned int nr_emu_nids);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 691506bdf2c5..36d283552f80 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -110,6 +110,7 @@ enum pageflags {
PG_reclaim, /* To be reclaimed asap */
PG_swapbacked, /* Page is backed by RAM/swap */
PG_unevictable, /* Page is "unevictable" */
+ PG_dropbehind, /* drop pages on IO completion */
#ifdef CONFIG_MMU
PG_mlocked, /* Page is vma mlocked */
#endif
@@ -562,6 +563,10 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE)
FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE)
+FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE)
+ FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE)
+ __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE)
+
#ifdef CONFIG_HIGHMEM
/*
* Must use a macro here due to header dependency issues. page_zone() is not
@@ -894,21 +899,9 @@ static inline int PageTransCompound(const struct page *page)
{
return PageCompound(page);
}
-
-/*
- * PageTransTail returns true for both transparent huge pages
- * and hugetlbfs pages, so it should only be called when it's known
- * that hugetlbfs pages aren't involved.
- */
-static inline int PageTransTail(const struct page *page)
-{
- return PageTail(page);
-}
#else
TESTPAGEFLAG_FALSE(TransHuge, transhuge)
TESTPAGEFLAG_FALSE(TransCompound, transcompound)
-TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap)
-TESTPAGEFLAG_FALSE(TransTail, transtail)
#endif
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
@@ -918,11 +911,9 @@ TESTPAGEFLAG_FALSE(TransTail, transtail)
*
* This flag is set by hwpoison handler. Cleared by THP split or free page.
*/
-PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
- TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
+FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE)
#else
-PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
- TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
+FOLIO_FLAG_FALSE(has_hwpoisoned)
#endif
/*
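The PG_dropbehind additions above lean on the page-flags macro machinery: FOLIO_FLAG() generates folio_test_dropbehind(), folio_set_dropbehind() and folio_clear_dropbehind(), while the FOLIO_TEST_CLEAR_FLAG() and __FOLIO_SET_FLAG() lines add folio_test_clear_dropbehind() and __folio_set_dropbehind(). A minimal sketch of how a caller might use them; both helpers below are hypothetical:

#include <linux/page-flags.h>

/* Non-atomic set is safe while the folio is not yet visible to others. */
static void example_mark_dropbehind(struct folio *folio)
{
	__folio_set_dropbehind(folio);
}

/* On IO completion, consume the drop-behind hint exactly once. */
static bool example_should_drop(struct folio *folio)
{
	return folio_test_clear_dropbehind(folio);
}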
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 73dc2c1841ec..898bb788243b 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -31,7 +31,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
int migratetype);
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype, int flags, gfp_t gfp_flags);
+ int migratetype, int flags);
void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
int migratetype);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bcf0865a38ae..47bfc6b1b632 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -710,6 +710,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
* * %FGP_NOFS - __GFP_FS will get cleared in gfp.
* * %FGP_NOWAIT - Don't block on the folio lock.
* * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
+ * * %FGP_DONTCACHE - Uncached buffered IO
* * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin()
* implementation.
*/
@@ -723,10 +724,21 @@ typedef unsigned int __bitwise fgf_t;
#define FGP_NOWAIT ((__force fgf_t)0x00000020)
#define FGP_FOR_MMAP ((__force fgf_t)0x00000040)
#define FGP_STABLE ((__force fgf_t)0x00000080)
+#define FGP_DONTCACHE ((__force fgf_t)0x00000100)
#define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */
#define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
+static inline unsigned int filemap_get_order(size_t size)
+{
+ unsigned int shift = ilog2(size);
+
+ if (shift <= PAGE_SHIFT)
+ return 0;
+
+ return shift - PAGE_SHIFT;
+}
+
/**
* fgf_set_order - Encode a length in the fgf_t flags.
* @size: The suggested size of the folio to create.
@@ -740,11 +752,11 @@ typedef unsigned int __bitwise fgf_t;
*/
static inline fgf_t fgf_set_order(size_t size)
{
- unsigned int shift = ilog2(size);
+ unsigned int order = filemap_get_order(size);
- if (shift <= PAGE_SHIFT)
+ if (!order)
return 0;
- return (__force fgf_t)((shift - PAGE_SHIFT) << 26);
+ return (__force fgf_t)(order << 26);
}
void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
@@ -1271,11 +1283,6 @@ void folio_wait_private_2(struct folio *folio);
int folio_wait_private_2_killable(struct folio *folio);
/*
- * Add an arbitrary waiter to a page's wait queue
- */
-void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter);
-
-/*
* Fault in userspace address range.
*/
size_t fault_in_writeable(char __user *uaddr, size_t size);
@@ -1353,6 +1360,7 @@ struct readahead_control {
pgoff_t _index;
unsigned int _nr_pages;
unsigned int _batch_count;
+ bool dropbehind;
bool _workingset;
unsigned long _pflags;
};
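FGF_GET_ORDER() and fgf_set_order() above encode the preferred folio order in the top six bits of fgf_t, so a single argument can carry both behaviour flags and a size hint. A sketch of how a write path might combine them with __filemap_get_folio(); the wrapper itself is hypothetical:

#include <linux/pagemap.h>

static struct folio *example_grab_write_folio(struct address_space *mapping,
					      loff_t pos, size_t len)
{
	/* Pack the usual write_begin flags together with a size hint. */
	fgf_t fgp = FGP_WRITEBEGIN | fgf_set_order(len);

	/* May return an ERR_PTR; callers must check with IS_ERR(). */
	return __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp,
				   mapping_gfp_mask(mapping));
}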
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index adef9d6e9b1b..94d267d02372 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -533,7 +533,14 @@ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- ptep_get_and_clear(mm, addr, ptep);
+ pte_t pte = ptep_get(ptep);
+
+ pte_clear(mm, addr, ptep);
+ /*
+ * No need for ptep_get_and_clear(): page table check doesn't care about
+ * any bits that could have been set by HW concurrently.
+ */
+ page_table_check_pte_clear(mm, pte);
}
#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH
diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h
index 412cdaba33eb..17e04859b9a4 100644
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -18,10 +18,6 @@ extern int sched_cpu_dying(unsigned int cpu);
# define sched_cpu_dying NULL
#endif
-#ifdef CONFIG_HOTPLUG_CPU
-extern void idle_task_exit(void);
-#else
static inline void idle_task_exit(void) {}
-#endif
#endif /* _LINUX_SCHED_HOTPLUG_H */
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index d1a2346cf0f8..5ce48eab7a2a 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -322,6 +322,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
* raw_seqcount_try_begin() - begin a seqcount_t read critical section
* w/o lockdep and w/o counter stabilization
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
+ * @start: count to be passed to read_seqcount_retry()
*
* Similar to raw_seqcount_begin(), except it enables eliding the critical
* section entirely if odd, instead of doing the speculation knowing it will
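The new @start line documents the out parameter that is later handed to read_seqcount_retry(). A minimal sketch of the calling pattern the kerneldoc describes; the function and its arguments are hypothetical:

#include <linux/seqlock.h>

static bool example_read_speculative(seqcount_t *sc, const int *src, int *dst)
{
	unsigned int seq;

	/* Odd count: a writer is active, elide the section entirely. */
	if (!raw_seqcount_try_begin(sc, seq))
		return false;

	*dst = READ_ONCE(*src);

	/* Discard the speculative read if a writer raced with us. */
	return !read_seqcount_retry(sc, seq);
}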
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f3e0ac20c2e8..a5f475335aea 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -219,7 +219,6 @@ enum {
SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
/* add others here before... */
- SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */
};
#define SWAP_CLUSTER_MAX 32UL
@@ -257,18 +256,27 @@ struct swap_cluster_info {
u8 order;
struct list_head list;
};
-#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
-#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
-#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */
-#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */
+
+/* All on-list clusters must have a non-zero flag. */
+enum swap_cluster_flags {
+ CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */
+ CLUSTER_FLAG_FREE,
+ CLUSTER_FLAG_NONFULL,
+ CLUSTER_FLAG_FRAG,
+ /* Clusters with flags above are allocatable */
+ CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG,
+ CLUSTER_FLAG_FULL,
+ CLUSTER_FLAG_DISCARD,
+ CLUSTER_FLAG_MAX,
+};
/*
* The first page in the swap file is the swap header, which is always marked
* bad to prevent it from being allocated as an entry. This also prevents the
* cluster to which it belongs being marked free. Therefore 0 is safe to use as
- * a sentinel to indicate next is not valid in percpu_cluster.
+ * a sentinel to indicate an entry is not valid.
*/
-#define SWAP_NEXT_INVALID 0
+#define SWAP_ENTRY_INVALID 0
#ifdef CONFIG_THP_SWAP
#define SWAP_NR_ORDERS (PMD_ORDER + 1)
@@ -282,6 +290,7 @@ struct swap_cluster_info {
* throughput.
*/
struct percpu_cluster {
+	local_lock_t lock; /* Protects this percpu cluster */
unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};
@@ -304,15 +313,12 @@ struct swap_info_struct {
 	/* list of clusters that contain at least one free slot */
struct list_head frag_clusters[SWAP_NR_ORDERS];
 	/* list of clusters that are fragmented or contended */
- unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
- unsigned int lowest_bit; /* index of first free in swap_map */
- unsigned int highest_bit; /* index of last free in swap_map */
+ atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
unsigned int pages; /* total of usable pages of swap */
- unsigned int inuse_pages; /* number of those currently in use */
- unsigned int cluster_next; /* likely index for next allocation */
- unsigned int cluster_nr; /* countdown to next cluster search */
- unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */
+ atomic_long_t inuse_pages; /* number of those currently in use */
struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
+	struct percpu_cluster *global_cluster; /* Use one global cluster for rotational devices */
+ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */
struct rb_root swap_extent_root;/* root of the swap extent rbtree */
struct block_device *bdev; /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */
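With the enum above, a cluster is allocatable exactly when its flag lies in (CLUSTER_FLAG_NONE, CLUSTER_FLAG_USABLE], i.e. it sits on the free, nonfull or frag list. A sketch of that check, assuming the cluster's flags field holds one of these values; the helper is hypothetical:

#include <linux/swap.h>

static inline bool example_cluster_usable(const struct swap_cluster_info *ci)
{
	/* CLUSTER_FLAG_NONE marks a temporarily off-list cluster. */
	return ci->flags != CLUSTER_FLAG_NONE &&
	       ci->flags <= CLUSTER_FLAG_USABLE;
}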
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
index ae73a87775b3..b5ec038069da 100644
--- a/include/linux/swap_cgroup.h
+++ b/include/linux/swap_cgroup.h
@@ -6,10 +6,8 @@
#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
-extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
- unsigned short old, unsigned short new);
-extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
- unsigned int nr_ents);
+extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent);
+extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents);
extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
extern int swap_cgroup_swapon(int type, unsigned long max_pages);
extern void swap_cgroup_swapoff(int type);
@@ -17,8 +15,12 @@ extern void swap_cgroup_swapoff(int type);
#else
static inline
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
- unsigned int nr_ents)
+void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
+{
+}
+
+static inline
+unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents)
{
return 0;
}
diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
index 15adfb8c813a..840aec3523b2 100644
--- a/include/linux/swap_slots.h
+++ b/include/linux/swap_slots.h
@@ -16,15 +16,12 @@ struct swap_slots_cache {
swp_entry_t *slots;
int nr;
int cur;
- spinlock_t free_lock; /* protects slots_ret, n_ret */
- swp_entry_t *slots_ret;
int n_ret;
};
void disable_swap_slots_cache_lock(void);
void reenable_swap_slots_cache_unlock(void);
void enable_swap_slots_cache(void);
-void free_swap_slot(swp_entry_t entry);
extern bool swap_slot_cache_enabled;
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 2964171856e0..0646804860ff 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -19,9 +19,6 @@ enum task_work_notify_mode {
TWA_SIGNAL,
TWA_SIGNAL_NO_IPI,
TWA_NMI_CURRENT,
-
- TWA_FLAGS = 0xff00,
- TWAF_NO_ALLOC = 0x0100,
};
static inline bool task_work_pending(struct task_struct *task)