From cc2383ec06be093789469852e1fe96e1148e9a2c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:37 -0700 Subject: mm: introduce arch-specific vma flag VM_ARCH_1 Combine several arch-specific vma flags into one. before patch: 0x00000200 0x01000000 0x20000000 0x40000000 x86 VM_NOHUGEPAGE VM_HUGEPAGE - VM_PAT powerpc - - VM_SAO - parisc VM_GROWSUP - - - ia64 VM_GROWSUP - - - nommu - VM_MAPPED_COPY - - others - - - - after patch: 0x00000200 0x01000000 0x20000000 0x40000000 x86 - VM_PAT VM_HUGEPAGE VM_NOHUGEPAGE powerpc - VM_SAO - - parisc - VM_GROWSUP - - ia64 - VM_GROWSUP - - nommu - VM_MAPPED_COPY - - others - VM_ARCH_1 - - And voila! One completely free bit. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm/ksm.c') diff --git a/mm/ksm.c b/mm/ksm.c index 47c885368890..d1cbe2aa6b3a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1470,9 +1470,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) + VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ +#ifdef VM_SAO + if (*vm_flags & VM_SAO) + return 0; +#endif + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) -- cgit v1.2.3 From 4b6e1e37026ec7dae9b23d78ffcebdd5ddb1bfa1 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:40 -0700 Subject: mm: 
kill vma flag VM_INSERTPAGE Merge VM_INSERTPAGE into VM_MIXEDMAP. VM_MIXEDMAP VMA can mix pure-pfn ptes, special ptes and normal ptes. Now copy_page_range() always copies VM_MIXEDMAP VMA on fork like VM_PFNMAP. If driver populates whole VMA at mmap() it probably does not expect page-faults. This patch removes special check from vma_wants_writenotify() which disables pages write tracking for VMA populated via vm_insert_page(). BDI below mapped file should not use dirty-accounting, moreover do_wp_page() can handle this. vm_insert_page() still marks vma after first usage. Usually it is called from f_op->mmap() handler under mm->mmap_sem write-lock, so it is able to change vma->vm_flags. Caller must set VM_MIXEDMAP at mmap time if it wants to call this function from other places, for example from page-fault handler. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/ksm.c') diff --git a/mm/ksm.c b/mm/ksm.c index d1cbe2aa6b3a..f9ccb16559ee 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1469,7 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_RESERVED | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ -- cgit v1.2.3 From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and 
mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it has lost its original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it is not exported into userspace directly, it only reduces total_vm shown in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. 
Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/ksm.c') diff --git a/mm/ksm.c b/mm/ksm.c index f9ccb16559ee..9638620a7530 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1469,8 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | - VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO -- cgit v1.2.3 From bf181b9f9d8dfbba58b23441ad60d0bc33806d64 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:39 -0700 Subject: mm anon rmap: replace same_anon_vma linked list with an interval tree. When a large VMA (anon or private file mapping) is first touched, which will populate its anon_vma field, and then split into many regions through the use of mprotect(), the original anon_vma ends up linking all of the vmas on a linked list. This can cause rmap to become inefficient, as we have to walk potentially thousands of irrelevant vmas before finding the one a given anon page might fall into. By replacing the same_anon_vma linked list with an interval tree (where each avc's interval is determined by its vma's start and last pgoffs), we can make rmap efficient for this use case again. While the change is large, all of its pieces are fairly simple. Most places that were walking the same_anon_vma list were looking for a known pgoff, so they can just use the anon_vma_interval_tree_foreach() interval tree iterator instead. The exception here is ksm, where the page's index is not known. 
It would probably be possible to rework ksm so that the index would be known, but for now I have decided to keep things simple and just walk the entirety of the interval tree there. When updating vma's that already have an anon_vma assigned, we must take care to re-index the corresponding avc's on their interval tree. This is done through the use of anon_vma_interval_tree_pre_update_vma() and anon_vma_interval_tree_post_update_vma(), which remove the avc's from their interval tree before the update and re-insert them after the update. The anon_vma stays locked during the update, so there is no chance that rmap would miss the vmas that are being updated. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm/ksm.c') diff --git a/mm/ksm.c b/mm/ksm.c index 9638620a7530..14ee5cf8a513 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1618,7 +1618,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1671,7 +1672,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1723,7 +1725,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) -- 
cgit v1.2.3 From 39b5f29ac1f988c1615fbc9c69f6651ab0d0c3c7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:18 -0700 Subject: mm: remove vma arg from page_evictable page_evictable(page, vma) is an irritant: almost all its callers pass NULL for vma. Remove the vma arg and use mlocked_vma_newpage(vma, page) explicitly in the couple of places it's needed. But in those places we don't even need page_evictable() itself! They're dealing with a freshly allocated anonymous page, which has no "mapping" and cannot be mlocked yet. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/ksm.c') diff --git a/mm/ksm.c b/mm/ksm.c index 14ee5cf8a513..ecbc090cdaad 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1586,7 +1586,7 @@ struct page *ksm_does_need_to_copy(struct page *page, SetPageSwapBacked(new_page); __set_page_locked(new_page); - if (page_evictable(new_page, vma)) + if (!mlocked_vma_newpage(vma, new_page)) lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(new_page); -- cgit v1.2.3 From 6bdb913f0a70a4dfb7f066fb15e2d6f960701d00 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 8 Oct 2012 16:33:35 -0700 Subject: mm: wrap calls to set_pte_at_notify with invalidate_range_start and invalidate_range_end In order to allow sleeping during invalidate_page mmu notifier calls, we need to avoid calling when holding the PT lock. In addition to its direct calls, invalidate_page can also be called as a substitute for a change_pte call, in case the notifier client hasn't implemented change_pte. This patch drops the invalidate_page call from change_pte, and instead wraps all calls to change_pte with invalidate_range_start and invalidate_range_end calls. 
Note that change_pte still cannot sleep after this patch, and that clients implementing change_pte should not take action on it in case the number of outstanding invalidate_range_start calls is larger than one, otherwise they might miss a later invalidation. Signed-off-by: Haggai Eran Cc: Andrea Arcangeli Cc: Sagi Grimberg Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Or Gerlitz Cc: Haggai Eran Cc: Shachar Raindel Cc: Liran Liss Cc: Christoph Lameter Cc: Avi Kivity Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'mm/ksm.c') diff --git a/mm/ksm.c b/mm/ksm.c index ecbc090cdaad..ae539f0b8aa1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; int swapped; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) goto out; BUG_ON(PageTransCompound(page)); + + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + goto out_mn; if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; @@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, out_unlock: pte_unmap_unlock(ptep, ptl); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } @@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; unsigned long addr; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) @@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct 
page *page, if (!pmd_present(*pmd)) goto out; + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte_same(*ptep, orig_pte)) { pte_unmap_unlock(ptep, ptl); - goto out; + goto out_mn; } get_page(kpage); @@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pte_unmap_unlock(ptep, ptl); err = 0; +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } -- cgit v1.2.3