Diffstat (limited to 'mm')
-rw-r--r-- | mm/gup.c            |  3
-rw-r--r-- | mm/hugetlb.c        | 14
-rw-r--r-- | mm/memory-failure.c | 17
-rw-r--r-- | mm/memory.c         | 43
-rw-r--r-- | mm/memory_hotplug.c | 22
-rw-r--r-- | mm/mempolicy.c      |  6
-rw-r--r-- | mm/migrate.c        | 18
-rw-r--r-- | mm/mmap.c           |  7
-rw-r--r-- | mm/oom_kill.c       | 12
-rw-r--r-- | mm/page-writeback.c | 35
-rw-r--r-- | mm/page_alloc.c     | 18
-rw-r--r-- | mm/shmem.c          | 12
-rw-r--r-- | mm/slab.c           |  6
-rw-r--r-- | mm/util.c           |  2
-rw-r--r-- | mm/vmalloc.c        |  2
-rw-r--r-- | mm/zsmalloc.c       | 11
16 files changed, 158 insertions, 70 deletions
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1423,7 +1423,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 
 		if (pmd_none(pmd))
 			return 0;
-		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+			     pmd_devmap(pmd))) {
 			/*
 			 * NUMA hinting faults need to be handled in the GUP
 			 * slowpath for accounting purposes and so that they
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e50fcfe6ad8..8b682da98d95 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3579,7 +3579,6 @@ retry_avoidcopy:
 	copy_user_huge_page(new_page, old_page, address, vma,
 			    pages_per_huge_page(h));
 	__SetPageUptodate(new_page);
-	set_page_huge_active(new_page);
 
 	mmun_start = address & huge_page_mask(h);
 	mmun_end = mmun_start + huge_page_size(h);
@@ -3601,6 +3600,7 @@ retry_avoidcopy:
 				make_huge_pte(vma, new_page, 1));
 		page_remove_rmap(old_page, true);
 		hugepage_add_new_anon_rmap(new_page, vma, address);
+		set_page_huge_active(new_page);
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
@@ -3683,6 +3683,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page;
 	pte_t new_pte;
 	spinlock_t *ptl;
+	bool new_page = false;
 
 	/*
 	 * Currently, we are forced to kill the process in the event the
@@ -3716,7 +3717,7 @@ retry:
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
 		__SetPageUptodate(page);
-		set_page_huge_active(page);
+		new_page = true;
 
 		if (vma->vm_flags & VM_MAYSHARE) {
 			int err = huge_add_to_page_cache(page, mapping, idx);
@@ -3788,6 +3789,15 @@ retry:
 	}
 
 	spin_unlock(ptl);
+
+	/*
+	 * Only make newly allocated pages active. Existing pages found
+	 * in the pagecache could be !page_huge_active() if they have been
+	 * isolated for migration.
+	 */
+	if (new_page)
+		set_page_huge_active(page);
+
 	unlock_page(page);
 out:
 	return ret;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 851efb004857..d6524dce43b2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -336,7 +336,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
 		if (fail || tk->addr_valid == 0) {
 			pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
 			       pfn, tk->tsk->comm, tk->tsk->pid);
-			force_sig(SIGKILL, tk->tsk);
+			do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
+					 tk->tsk, PIDTYPE_PID);
 		}
 
 		/*
@@ -1704,19 +1705,17 @@ static int soft_offline_in_use_page(struct page *page, int flags)
 	struct page *hpage = compound_head(page);
 
 	if (!PageHuge(page) && PageTransHuge(hpage)) {
-		lock_page(hpage);
-		if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
-			unlock_page(hpage);
-			if (!PageAnon(hpage))
+		lock_page(page);
+		if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+			unlock_page(page);
+			if (!PageAnon(page))
 				pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
 			else
 				pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
-			put_hwpoison_page(hpage);
+			put_hwpoison_page(page);
 			return -EBUSY;
 		}
-		unlock_page(hpage);
-		get_hwpoison_page(page);
-		put_hwpoison_page(hpage);
+		unlock_page(page);
 	}
 
 	if (PageHuge(page))
diff --git a/mm/memory.c b/mm/memory.c
index f3fef1df7402..47248dc0b9e1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2823,6 +2823,28 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
 	struct vm_fault vmf;
 	int ret;
 
+	/*
+	 * Preallocate pte before we take page_lock because this might lead to
+	 * deadlocks for memcg reclaim which waits for pages under writeback:
+	 *				lock_page(A)
+	 *				SetPageWriteback(A)
+	 *				unlock_page(A)
+	 * lock_page(B)
+	 *				lock_page(B)
+	 * pte_alloc_pne
+	 *   shrink_page_list
+	 *     wait_on_page_writeback(A)
+	 *				SetPageWriteback(B)
+	 *				unlock_page(B)
+	 *				# flush A, B to clear the writeback
+	 */
+	if (pmd_none(*fe->pmd) && !fe->prealloc_pte) {
+		fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
+		if (!fe->prealloc_pte)
+			return VM_FAULT_OOM;
+		smp_wmb(); /* See comment in __pte_alloc() */
+	}
+
 	vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
 	vmf.pgoff = pgoff;
 	vmf.flags = fe->flags;
@@ -3307,15 +3329,24 @@ static int do_fault(struct fault_env *fe)
 {
 	struct vm_area_struct *vma = fe->vma;
 	pgoff_t pgoff = linear_page_index(vma, fe->address);
+	int ret;
 
 	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
 	if (!vma->vm_ops->fault)
-		return VM_FAULT_SIGBUS;
-	if (!(fe->flags & FAULT_FLAG_WRITE))
-		return do_read_fault(fe, pgoff);
-	if (!(vma->vm_flags & VM_SHARED))
-		return do_cow_fault(fe, pgoff);
-	return do_shared_fault(fe, pgoff);
+		ret = VM_FAULT_SIGBUS;
+	else if (!(fe->flags & FAULT_FLAG_WRITE))
+		ret = do_read_fault(fe, pgoff);
+	else if (!(vma->vm_flags & VM_SHARED))
+		ret = do_cow_fault(fe, pgoff);
+	else
+		ret = do_shared_fault(fe, pgoff);
+
+	/* preallocated pagetable is unused: free it */
+	if (fe->prealloc_pte) {
+		pte_free(vma->vm_mm, fe->prealloc_pte);
+		fe->prealloc_pte = 0;
+	}
+	return ret;
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0f962cc3f1bf..b4c8d7b9ab82 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,7 @@
 #include <linux/memblock.h>
 #include <linux/bootmem.h>
 #include <linux/compaction.h>
+#include <linux/rmap.h>
 
 #include <asm/tlbflush.h>
 
@@ -1470,7 +1471,8 @@ static struct page *next_active_pageblock(struct page *page)
 bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
 {
 	struct page *page = pfn_to_page(start_pfn);
-	struct page *end_page = page + nr_pages;
+	unsigned long end_pfn = min(start_pfn + nr_pages, zone_end_pfn(page_zone(page)));
+	struct page *end_page = pfn_to_page(end_pfn);
 
 	/* Check the starting page of each pageblock within the range */
 	for (; page < end_page; page = next_active_pageblock(page)) {
@@ -1510,6 +1512,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
 			i++;
 		if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
 			continue;
+		/* Check if we got outside of the zone */
+		if (zone && !zone_spans_pfn(zone, pfn + i))
+			return 0;
 		page = pfn_to_page(pfn + i);
 		if (zone && page_zone(page) != zone)
 			return 0;
@@ -1617,6 +1622,21 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 			continue;
 		}
 
+		/*
+		 * HWPoison pages have elevated reference counts so the migration would
+		 * fail on them. It also doesn't make any sense to migrate them in the
+		 * first place. Still try to unmap such a page in case it is still mapped
+		 * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
+		 * the unmap as the catch all safety net).
+		 */
+		if (PageHWPoison(page)) {
+			if (WARN_ON(PageLRU(page)))
+				isolate_lru_page(page);
+			if (page_mapped(page))
+				try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+			continue;
+		}
+
 		if (!get_page_unless_zero(page))
 			continue;
 		/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e21d9b44247b..593b74bed59b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1327,7 +1327,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 			      nodemask_t *nodes)
 {
 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
 
 	if (copy > nbytes) {
 		if (copy > PAGE_SIZE)
@@ -1488,7 +1488,7 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
 	int uninitialized_var(pval);
 	nodemask_t nodes;
 
-	if (nmask != NULL && maxnode < MAX_NUMNODES)
+	if (nmask != NULL && maxnode < nr_node_ids)
 		return -EINVAL;
 
 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1517,7 +1517,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
 	unsigned long nr_bits, alloc_size;
 	DECLARE_BITMAP(bm, MAX_NUMNODES);
 
-	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
 	if (nmask)
diff --git a/mm/migrate.c b/mm/migrate.c
index 821623fc7091..b810ac1359f0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1044,10 +1044,13 @@ out:
 	 * If migration is successful, decrease refcount of the newpage
 	 * which will not free the page because new page owner increased
 	 * refcounter. As well, if it is LRU page, add the page to LRU
-	 * list in here.
+	 * list in here. Use the old state of the isolated source page to
+	 * determine if we migrated a LRU page. newpage was already unlocked
+	 * and possibly modified by its owner - don't rely on the page
+	 * state.
 	 */
 	if (rc == MIGRATEPAGE_SUCCESS) {
-		if (unlikely(__PageMovable(newpage)))
+		if (unlikely(!is_lru))
 			put_page(newpage);
 		else
 			putback_lru_page(newpage);
@@ -1231,6 +1234,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 		lock_page(hpage);
 	}
 
+	/*
+	 * Check for pages which are in the process of being freed. Without
+	 * page_mapping() set, hugetlbfs specific move page routine will not
+	 * be called and we could leak usage counts for subpools.
+	 */
+	if (page_private(hpage) && !page_mapping(hpage)) {
+		rc = -EBUSY;
+		goto out_unlock;
+	}
+
 	if (PageAnon(hpage))
 		anon_vma = page_get_anon_vma(hpage);
 
@@ -1262,6 +1275,7 @@ put_anon:
 		set_page_owner_migrate_reason(new_hpage, reason);
 	}
 
+out_unlock:
 	unlock_page(hpage);
 out:
 	if (rc != -EAGAIN)
diff --git a/mm/mmap.c b/mm/mmap.c
index 283755645d17..3f2314ad6acd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2345,12 +2345,11 @@ int expand_downwards(struct vm_area_struct *vma,
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *prev;
 	unsigned long gap_addr;
-	int error;
+	int error = 0;
 
 	address &= PAGE_MASK;
-	error = security_mmap_addr(address);
-	if (error)
-		return error;
+	if (address < mmap_min_addr)
+		return -EPERM;
 
 	/* Enforce stack_guard_gap */
 	gap_addr = address - stack_guard_gap;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4a184157cc3d..d514eebad905 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -626,8 +626,8 @@ static void wake_oom_reaper(struct task_struct *tsk)
 	if (!oom_reaper_th)
 		return;
 
-	/* tsk is already queued? */
-	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+	/* mm is already queued? */
+	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
 		return;
 
 	get_task_struct(tsk);
@@ -861,6 +861,13 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	 * still freeing memory.
	 */
 	read_lock(&tasklist_lock);
+
+	/*
+	 * The task 'p' might have already exited before reaching here. The
+	 * put_task_struct() will free task_struct 'p' while the loop still try
+	 * to access the field of 'p', so, get an extra reference.
+	 */
+	get_task_struct(p);
 	for_each_thread(p, t) {
 		list_for_each_entry(child, &t->children, sibling) {
 			unsigned int child_points;
@@ -880,6 +887,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 			}
 		}
 	}
+	put_task_struct(p);
 	read_unlock(&tasklist_lock);
 
 	p = find_lock_task_mm(victim);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 807236aed275..281a46aeae61 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2148,6 +2148,7 @@ int write_cache_pages(struct address_space *mapping,
 {
 	int ret = 0;
 	int done = 0;
+	int error;
 	struct pagevec pvec;
 	int nr_pages;
 	pgoff_t uninitialized_var(writeback_index);
@@ -2244,25 +2245,31 @@ continue_unlock:
 				goto continue_unlock;
 
 			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
-			ret = (*writepage)(page, wbc, data);
-			if (unlikely(ret)) {
-				if (ret == AOP_WRITEPAGE_ACTIVATE) {
+			error = (*writepage)(page, wbc, data);
+			if (unlikely(error)) {
+				/*
+				 * Handle errors according to the type of
+				 * writeback. There's no need to continue for
+				 * background writeback. Just push done_index
+				 * past this page so media errors won't choke
+				 * writeout for the entire file. For integrity
+				 * writeback, we must process the entire dirty
+				 * set regardless of errors because the fs may
+				 * still have state to clear for each page. In
+				 * that case we continue processing and return
+				 * the first error.
+				 */
+				if (error == AOP_WRITEPAGE_ACTIVATE) {
 					unlock_page(page);
-					ret = 0;
-				} else {
-					/*
-					 * done_index is set past this page,
-					 * so media errors will not choke
-					 * background writeout for the entire
-					 * file. This has consequences for
-					 * range_cyclic semantics (ie. it may
-					 * not be suitable for data integrity
-					 * writeout).
-					 */
+					error = 0;
+				} else if (wbc->sync_mode != WB_SYNC_ALL) {
+					ret = error;
 					done_index = page->index + 1;
 					done = 1;
 					break;
 				}
+				if (!ret)
+					ret = error;
 			}
 
 			/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 28240ce475d6..05f141e39ac1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3530,8 +3530,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	enum compact_result compact_result;
 	int compaction_retries;
 	int no_progress_loops;
-	unsigned long alloc_start = jiffies;
-	unsigned int stall_timeout = 10 * HZ;
 	unsigned int cpuset_mems_cookie;
 
 	/*
@@ -3704,14 +3702,6 @@ retry:
 	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
 		goto nopage;
 
-	/* Make sure we know about allocations which stall for too long */
-	if (time_after(jiffies, alloc_start + stall_timeout)) {
-		warn_alloc(gfp_mask,
-			"page allocation stalls for %ums, order:%u",
-			jiffies_to_msecs(jiffies-alloc_start), order);
-		stall_timeout += 10 * HZ;
-	}
-
 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
 				 did_some_progress > 0, &no_progress_loops))
 		goto retry;
@@ -3965,11 +3955,11 @@ refill:
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
-		page_ref_add(page, size - 1);
+		page_ref_add(page, size);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = size;
+		nc->pagecnt_bias = size + 1;
 		nc->offset = size;
 	}
 
@@ -3985,10 +3975,10 @@ refill:
 			size = nc->size;
 #endif
 		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, size);
+		set_page_count(page, size + 1);
 
 		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = size;
+		nc->pagecnt_bias = size + 1;
 
 		offset = size - fragsz;
 	}
diff --git a/mm/shmem.c b/mm/shmem.c
index 9b17bd4cbc5e..944242491059 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2896,16 +2896,20 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = d_inode(old_dentry);
-	int ret;
+	int ret = 0;
 
 	/*
 	 * No ordinary (disk based) filesystem counts links as inodes;
 	 * but each new link needs a new dentry, pinning lowmem, and
 	 * tmpfs dentries cannot be pruned until they are unlinked.
+	 * But if an O_TMPFILE file is linked into the tmpfs, the
+	 * first link must skip that, to get the accounting right.
 	 */
-	ret = shmem_reserve_inode(inode->i_sb);
-	if (ret)
-		goto out;
+	if (inode->i_nlink) {
+		ret = shmem_reserve_inode(inode->i_sb);
+		if (ret)
+			goto out;
+	}
 
 	dir->i_size += BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
diff --git a/mm/slab.c b/mm/slab.c
index 263dcda6897b..354a09deecff 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -682,8 +682,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
 	struct alien_cache *alc = NULL;
 
 	alc = kmalloc_node(memsize, gfp, node);
-	init_arraycache(&alc->ac, entries, batch);
-	spin_lock_init(&alc->lock);
+	if (alc) {
+		init_arraycache(&alc->ac, entries, batch);
+		spin_lock_init(&alc->lock);
+	}
 	return alc;
 }
diff --git a/mm/util.c b/mm/util.c
index 8c755d05d4e6..07f467206186 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -389,7 +389,7 @@ bool page_mapped(struct page *page)
 		return true;
 	if (PageHuge(page))
 		return false;
-	for (i = 0; i < hpage_nr_pages(page); i++) {
+	for (i = 0; i < (1 << compound_order(page)); i++) {
 		if (atomic_read(&page[i]._mapcount) >= 0)
 			return true;
 	}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fa598162dbf0..e6aa073f01df 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2191,7 +2191,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
 	if (!(area->flags & VM_USERMAP))
 		return -EINVAL;
 
-	if (kaddr + size > area->addr + area->size)
+	if (kaddr + size > area->addr + get_vm_area_size(area))
 		return -EINVAL;
 
 	do {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d3548c48369f..cf15851a7d2f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -473,7 +473,7 @@ static bool is_zspage_isolated(struct zspage *zspage)
 	return zspage->isolated;
 }
 
-static int is_first_page(struct page *page)
+static __maybe_unused int is_first_page(struct page *page)
 {
 	return PagePrivate(page);
 }
@@ -558,20 +558,23 @@ static int get_size_class_index(int size)
 	return min(zs_size_classes - 1, idx);
 }
 
+/* type can be of enum type zs_stat_type or fullness_group */
 static inline void zs_stat_inc(struct size_class *class,
-				enum zs_stat_type type, unsigned long cnt)
+				int type, unsigned long cnt)
 {
 	class->stats.objs[type] += cnt;
 }
 
+/* type can be of enum type zs_stat_type or fullness_group */
 static inline void zs_stat_dec(struct size_class *class,
-				enum zs_stat_type type, unsigned long cnt)
+				int type, unsigned long cnt)
 {
 	class->stats.objs[type] -= cnt;
 }
 
+/* type can be of enum type zs_stat_type or fullness_group */
 static inline unsigned long zs_stat_get(struct size_class *class,
-				enum zs_stat_type type)
+				int type)
 {
 	return class->stats.objs[type];
 }