summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/compaction.c20
-rw-r--r--mm/fremap.c28
-rw-r--r--mm/huge_memory.c2
-rw-r--r--mm/ksm.c2
-rw-r--r--mm/memcontrol.c14
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/mempolicy.c74
-rw-r--r--mm/migrate.c43
-rw-r--r--mm/page_alloc.c30
-rw-r--r--mm/rmap.c5
-rw-r--r--mm/swap.c4
12 files changed, 120 insertions, 108 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 2d9f1504d75e..2888024e0b0a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -575,5 +575,5 @@ config PGTABLE_MAPPING
then you should select this. This causes zsmalloc to use page table
mapping rather than copying for object mapping.
- You can check speed with zsmalloc benchmark[1].
- [1] https://github.com/spartacus06/zsmalloc
+ You can check speed with zsmalloc benchmark:
+ https://github.com/spartacus06/zsmapbench
diff --git a/mm/compaction.c b/mm/compaction.c
index b48c5259ea33..918577595ea8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -251,7 +251,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
{
int nr_scanned = 0, total_isolated = 0;
struct page *cursor, *valid_page = NULL;
- unsigned long nr_strict_required = end_pfn - blockpfn;
unsigned long flags;
bool locked = false;
@@ -264,11 +263,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
nr_scanned++;
if (!pfn_valid_within(blockpfn))
- continue;
+ goto isolate_fail;
+
if (!valid_page)
valid_page = page;
if (!PageBuddy(page))
- continue;
+ goto isolate_fail;
/*
* The zone lock must be held to isolate freepages.
@@ -289,12 +289,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
- continue;
+ goto isolate_fail;
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
- if (!isolated && strict)
- break;
total_isolated += isolated;
for (i = 0; i < isolated; i++) {
list_add(&page->lru, freelist);
@@ -305,7 +303,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (isolated) {
blockpfn += isolated - 1;
cursor += isolated - 1;
+ continue;
}
+
+isolate_fail:
+ if (strict)
+ break;
+ else
+ continue;
+
}
trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
@@ -315,7 +321,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
* pages requested were isolated. If there were any failures, 0 is
* returned and CMA will fail.
*/
- if (strict && nr_strict_required > total_isolated)
+ if (strict && blockpfn < end_pfn)
total_isolated = 0;
if (locked)
diff --git a/mm/fremap.c b/mm/fremap.c
index bbc4d660221a..34feba60a17e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -23,28 +23,44 @@
#include "internal.h"
+static int mm_counter(struct page *page)
+{
+ return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
+}
+
static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
+ struct page *page;
+ swp_entry_t entry;
if (pte_present(pte)) {
- struct page *page;
-
flush_cache_page(vma, addr, pte_pfn(pte));
pte = ptep_clear_flush(vma, addr, ptep);
page = vm_normal_page(vma, addr, pte);
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, mm_counter(page));
page_remove_rmap(page);
page_cache_release(page);
+ }
+ } else { /* zap_pte() is not called when pte_none() */
+ if (!pte_file(pte)) {
update_hiwater_rss(mm);
- dec_mm_counter(mm, MM_FILEPAGES);
+ entry = pte_to_swp_entry(pte);
+ if (non_swap_entry(entry)) {
+ if (is_migration_entry(entry)) {
+ page = migration_entry_to_page(entry);
+ dec_mm_counter(mm, mm_counter(page));
+ }
+ } else {
+ free_swap_and_cache(entry);
+ dec_mm_counter(mm, MM_SWAPENTS);
+ }
}
- } else {
- if (!pte_file(pte))
- free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear_not_present_full(mm, addr, ptep, 0);
}
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4df39b1bde91..1546655a2d78 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1961,7 +1961,7 @@ out:
return ret;
}
-#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
diff --git a/mm/ksm.c b/mm/ksm.c
index aa4c7c7250c1..68710e80994a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item)
static struct page *page_trans_compound_anon(struct page *page)
{
if (PageTransCompound(page)) {
- struct page *head = compound_trans_head(page);
+ struct page *head = compound_head(page);
/*
* head may actually be splitted and freed from under
* us but it's ok here.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ce7a8cc7b404..5b6b0039f725 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1127,8 +1127,8 @@ skip_node:
* skipping css reference should be safe.
*/
if (next_css) {
- if ((next_css->flags & CSS_ONLINE) &&
- (next_css == &root->css || css_tryget(next_css)))
+ if ((next_css == &root->css) ||
+ ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
return mem_cgroup_from_css(next_css);
prev_css = next_css;
@@ -6595,6 +6595,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_event *event, *tmp;
+ struct cgroup_subsys_state *iter;
/*
* Unregister events and notify userspace.
@@ -6611,7 +6612,14 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
kmem_cgroup_css_offline(memcg);
mem_cgroup_invalidate_reclaim_iterators(memcg);
- mem_cgroup_reparent_charges(memcg);
+
+ /*
+ * This requires that offlining is serialized. Right now that is
+ * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+ */
+ css_for_each_descendant_post(iter, css)
+ mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
+
mem_cgroup_destroy_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2f2f34a4e77d..90002ea43638 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1651,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_trans_head(page);
+ struct page *hpage = compound_head(page);
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eeb2e3d2c962..4755c8576942 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2301,35 +2301,6 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
-#ifdef CONFIG_NUMA_BALANCING
-static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
- /* Never defer a private fault */
- if (cpupid_match_pid(p, last_cpupid))
- return false;
-
- if (p->numa_migrate_deferred) {
- p->numa_migrate_deferred--;
- return true;
- }
- return false;
-}
-
-static inline void defer_numa_migrate(struct task_struct *p)
-{
- p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
-}
-#else
-static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
- return false;
-}
-
-static inline void defer_numa_migrate(struct task_struct *p)
-{
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
/**
* mpol_misplaced - check whether current page node is valid in policy
*
@@ -2403,52 +2374,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
- int last_cpupid;
- int this_cpupid;
-
polnid = thisnid;
- this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
-
- /*
- * Multi-stage node selection is used in conjunction
- * with a periodic migration fault to build a temporal
- * task<->page relation. By using a two-stage filter we
- * remove short/unlikely relations.
- *
- * Using P(p) ~ n_p / n_t as per frequentist
- * probability, we can equate a task's usage of a
- * particular page (n_p) per total usage of this
- * page (n_t) (in a given time-span) to a probability.
- *
- * Our periodic faults will sample this probability and
- * getting the same result twice in a row, given these
- * samples are fully independent, is then given by
- * P(n)^2, provided our sample period is sufficiently
- * short compared to the usage pattern.
- *
- * This quadric squishes small probabilities, making
- * it less likely we act on an unlikely task<->page
- * relation.
- */
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
- if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
- /* See sysctl_numa_balancing_migrate_deferred comment */
- if (!cpupid_match_pid(current, last_cpupid))
- defer_numa_migrate(current);
-
- goto out;
- }
-
- /*
- * The quadratic filter above reduces extraneous migration
- * of shared pages somewhat. This code reduces it even more,
- * reducing the overhead of page migrations of shared pages.
- * This makes workloads with shared pages rely more on
- * "move task near its memory", and less on "move memory
- * towards its task", which is exactly what we want.
- */
- if (numa_migrate_deferred(current, last_cpupid))
+ if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
goto out;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 482a33d89134..bed48809e5d0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -178,6 +178,37 @@ out:
}
/*
+ * Congratulations to trinity for discovering this bug.
+ * mm/fremap.c's remap_file_pages() accepts any range within a single vma to
+ * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then
+ * replace the specified range by file ptes throughout (maybe populated after).
+ * If page migration finds a page within that range, while it's still located
+ * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem:
+ * zap_pte() clears the temporary migration entry before mmap_sem is dropped.
+ * But if the migrating page is in a part of the vma outside the range to be
+ * remapped, then it will not be cleared, and remove_migration_ptes() needs to
+ * deal with it. Fortunately, this part of the vma is of course still linear,
+ * so we just need to use linear location on the nonlinear list.
+ */
+static int remove_linear_migration_ptes_from_nonlinear(struct page *page,
+ struct address_space *mapping, void *arg)
+{
+ struct vm_area_struct *vma;
+ /* hugetlbfs does not support remap_pages, so no huge pgoff worries */
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ unsigned long addr;
+
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
+
+ addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ if (addr >= vma->vm_start && addr < vma->vm_end)
+ remove_migration_pte(page, vma, addr, arg);
+ }
+ return SWAP_AGAIN;
+}
+
+/*
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
@@ -186,6 +217,7 @@ static void remove_migration_ptes(struct page *old, struct page *new)
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
.arg = old,
+ .file_nonlinear = remove_linear_migration_ptes_from_nonlinear,
};
rmap_walk(new, &rwc);
@@ -1158,7 +1190,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
pm->node);
else
return alloc_pages_exact_node(pm->node,
- GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
+ GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}
/*
@@ -1544,9 +1576,9 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
struct page *newpage;
newpage = alloc_pages_exact_node(nid,
- (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
- __GFP_NOMEMALLOC | __GFP_NORETRY |
- __GFP_NOWARN) &
+ (GFP_HIGHUSER_MOVABLE |
+ __GFP_THISNODE | __GFP_NOMEMALLOC |
+ __GFP_NORETRY | __GFP_NOWARN) &
~GFP_IOFS, 0);
return newpage;
@@ -1747,7 +1779,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
goto out_dropref;
new_page = alloc_pages_node(node,
- (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+ (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
+ HPAGE_PMD_ORDER);
if (!new_page)
goto out_fail;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e3758a09a009..3bac76ae4b30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- __SetPageTail(p);
set_page_count(p, 0);
p->first_page = page;
+ /* Make sure p->first_page is always valid for PageTail() */
+ smp_wmb();
+ __SetPageTail(p);
}
}
@@ -1236,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
}
local_irq_restore(flags);
}
+static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+{
+ return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
+}
+#else
+static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+{
+ return false;
+}
#endif
/*
@@ -1572,7 +1583,13 @@ again:
get_pageblock_migratetype(page));
}
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+ /*
+ * NOTE: GFP_THISNODE allocations do not partake in the kswapd
+ * aging protocol, so they can't be fair.
+ */
+ if (!gfp_thisnode_allocation(gfp_flags))
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
@@ -1944,8 +1961,12 @@ zonelist_scan:
* ultimately fall back to remote zones that do not
* partake in the fairness round-robin cycle of this
* zonelist.
+ *
+ * NOTE: GFP_THISNODE allocations do not partake in
+ * the kswapd aging protocol, so they can't be fair.
*/
- if (alloc_flags & ALLOC_WMARK_LOW) {
+ if ((alloc_flags & ALLOC_WMARK_LOW) &&
+ !gfp_thisnode_allocation(gfp_mask)) {
if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
continue;
if (!zone_local(preferred_zone, zone))
@@ -2501,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* allowed per node queues are empty and that nodes are
* over allocated.
*/
- if (IS_ENABLED(CONFIG_NUMA) &&
- (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ if (gfp_thisnode_allocation(gfp_mask))
goto nopage;
restart:
diff --git a/mm/rmap.c b/mm/rmap.c
index d9d42316a99a..8fc049f9a5a6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1360,8 +1360,9 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
}
static int try_to_unmap_nonlinear(struct page *page,
- struct address_space *mapping, struct vm_area_struct *vma)
+ struct address_space *mapping, void *arg)
{
+ struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
@@ -1663,7 +1664,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
if (list_empty(&mapping->i_mmap_nonlinear))
goto done;
- ret = rwc->file_nonlinear(page, mapping, vma);
+ ret = rwc->file_nonlinear(page, mapping, rwc->arg);
done:
mutex_unlock(&mapping->i_mmap_mutex);
diff --git a/mm/swap.c b/mm/swap.c
index b31ba67d440a..0092097b3f4c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -98,7 +98,7 @@ static void put_compound_page(struct page *page)
}
/* __split_huge_page_refcount can run under us */
- page_head = compound_trans_head(page);
+ page_head = compound_head(page);
/*
* THP can not break up slab pages so avoid taking
@@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page)
*/
unsigned long flags;
bool got;
- struct page *page_head = compound_trans_head(page);
+ struct page *page_head = compound_head(page);
/* Ref to put_compound_page() comment. */
if (!__compound_tail_refcounted(page_head)) {