From 318b275fbca1ab9ec0862de71420e0e92c3d1aa7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 22 Mar 2011 16:30:51 -0700 Subject: mm: allow GUP to fail instead of waiting on a page GUP user may want to try to acquire a reference to a page if it is already in memory, but not if IO, to bring it in, is needed. For example KVM may tell vcpu to schedule another guest process if current one is trying to access swapped out page. Meanwhile, the page will be swapped in and the guest process, that depends on it, will be able to run again. This patch adds FAULT_FLAG_RETRY_NOWAIT (suggested by Linus) and FOLL_NOWAIT follow_page flags. FAULT_FLAG_RETRY_NOWAIT, when used in conjunction with VM_FAULT_ALLOW_RETRY, indicates to handle_mm_fault that it shouldn't drop mmap_sem and wait on a page, but return VM_FAULT_RETRY instead. [akpm@linux-foundation.org: improve FOLL_NOWAIT comment] Signed-off-by: Gleb Natapov Cc: Linus Torvalds Cc: Hugh Dickins Acked-by: Rik van Riel Cc: Michel Lespinasse Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index e48945ab362b..615be5127ce1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1569,6 +1569,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, fault_flags |= FAULT_FLAG_WRITE; if (nonblocking) fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (foll_flags & FOLL_NOWAIT) + fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); ret = handle_mm_fault(mm, vma, start, fault_flags); @@ -1595,7 +1597,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, tsk->min_flt++; if (ret & VM_FAULT_RETRY) { - *nonblocking = 0; + if (nonblocking) + *nonblocking = 0; return i; } -- cgit v1.2.3 From 31db58b3ab432f72ea76be58b12e6ffaf627d5db Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:15 -0400 Subject: mm: arch: make get_gate_vma take an mm_struct instead of a task_struct Morally, the presence of a gate vma is more an attribute of a particular mm than a particular task. Moreover, dropping the dependency on task_struct will help make both existing and future operations on mm's more flexible and convenient. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index e48945ab362b..b6dc37097433 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1488,7 +1488,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(tsk); + struct vm_area_struct *gate_vma = get_gate_vma(tsk->mm); pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -3496,7 +3496,7 @@ static int __init gate_vma_init(void) __initcall(gate_vma_init); #endif -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef AT_SYSINFO_EHDR return &gate_vma; -- cgit v1.2.3 From 83b964bbf82eb13a8f31bb49ca420787fe01f7a6 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:16 -0400 Subject: mm: arch: make in_gate_area take an mm_struct instead of a task_struct Morally, the question of whether an address lies in a gate vma should be asked with respect to an mm, not a particular task. Moreover, dropping the dependency on task_struct will help make existing and future operations on mm's more flexible and convenient. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index b6dc37097433..931d479b80c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1486,7 +1486,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(tsk, start)) { + if (!vma && in_gate_area(tsk->mm, start)) { unsigned long pg = start & PAGE_MASK; struct vm_area_struct *gate_vma = get_gate_vma(tsk->mm); pgd_t *pgd; -- cgit v1.2.3 From cae5d39032acf26c265f6b1dc73d7ce6ff4bc387 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:17 -0400 Subject: mm: arch: rename in_gate_area_no_task to in_gate_area_no_mm Now that gate vma's are referenced with respect to a particular mm and not a particular task it only makes sense to propagate the change to this predicate as well. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 931d479b80c2..5f5b5de5a40e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3505,7 +3505,7 @@ struct vm_area_struct *get_gate_vma(struct mm_struct *mm) #endif } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { #ifdef AT_SYSINFO_EHDR if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) -- cgit v1.2.3 From e7f22e207bacdba5b73f2893a3abe935a5373e2e Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:18 -0400 Subject: mm: use mm_struct to resolve gate vma's in __get_user_pages We now check if a requested user page overlaps a gate vma using the supplied mm instead of the supplied task. The given task is now used solely for accounting purposes and may be NULL. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- mm/memory.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 5f5b5de5a40e..5f585b65d734 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1486,9 +1486,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(tsk->mm, start)) { + if (!vma && in_gate_area(mm, start)) { unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(tsk->mm); + struct vm_area_struct *gate_vma = get_gate_vma(mm); pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -1589,10 +1589,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? i : -EFAULT; BUG(); } - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } if (ret & VM_FAULT_RETRY) { *nonblocking = 0; @@ -1638,7 +1641,8 @@ EXPORT_SYMBOL(__get_user_pages); /** * get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin -- cgit v1.2.3 From 206cb636576b969e9b471cdedeaea7752e6acb33 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:19 -0400 Subject: mm: factor out main logic of access_process_vm Introduce an internal helper __access_remote_vm and base access_process_vm on top of it. This new method may be called with a NULL task_struct if page fault accounting is not desired. This code will be shared with a new address space accessor that is independent of task_struct. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- mm/memory.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 5f585b65d734..820b4c4810f0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3650,20 +3650,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, #endif /* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages + * Access another process' address space as given in mm. If non-NULL, use the + * given task for page fault accounting. */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, int write) { - struct mm_struct *mm; struct vm_area_struct *vma; void *old_buf = buf; - mm = get_task_mm(tsk); - if (!mm) - return 0; - down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ while (len) { @@ -3712,11 +3707,31 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in addr += bytes; } up_read(&mm->mmap_sem); - mmput(mm); return buf - old_buf; } +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, int write) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + ret = __access_remote_vm(tsk, mm, addr, buf, len, write); + mmput(mm); + + return ret; +} + /* * Print the name of a VMA. */ -- cgit v1.2.3 From 5ddd36b9c59887c6416e21daf984fbdd9b1818df Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:20 -0400 Subject: mm: implement access_remote_vm Provide an alternative to access_process_vm that allows the caller to obtain a reference to the supplied mm_struct. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- mm/memory.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 820b4c4810f0..468f5076754c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3711,6 +3711,22 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, return buf - old_buf; } +/** + * @access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @write: whether the access is a write + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write) +{ + return __access_remote_vm(NULL, mm, addr, buf, len, write); +} + /* * Access another process' address space. * Source/target buffer must be kernel space, -- cgit v1.2.3 From 56039efa18f2530fc23e8ef19e716b65ee2a1d1e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 23 Mar 2011 16:42:19 -0700 Subject: memcg: fix ugly initialization of return value is in caller Remove initialization of vaiable in caller of memory cgroup function. Actually, it's return value of memcg function but it's initialized in caller. Some memory cgroup uses following style to bring the result of start function to the end function for avoiding races. mem_cgroup_start_A(&(*ptr)) /* Something very complicated can happen here. */ mem_cgroup_end_A(*ptr) In some calls, *ptr should be initialized to NULL be caller. But it's ugly. This patch fixes that *ptr is initialized by _start function. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Johannes Weiner Acked-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 615be5127ce1..20d5f7499ce2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2767,7 +2767,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swp_entry_t entry; pte_t pte; int locked; - struct mem_cgroup *ptr = NULL; + struct mem_cgroup *ptr; int exclusive = 0; int ret = 0; -- cgit v1.2.3 From ae91dbfc9949cf042c45798557b48d3b83bc3635 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 26 Mar 2011 13:27:01 -0700 Subject: mm: fix memory.c incorrect kernel-doc Fix mm/memory.c incorrect kernel-doc function notation: Warning(mm/memory.c:3718): Cannot understand * @access_remote_vm - access another process' address space on line 3718 - I thought it was a doc line Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 51a5c23704af..9da8cab1b1b0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3715,7 +3715,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, } /** - * @access_remote_vm - access another process' address space + * access_remote_vm - access another process' address space * @mm: the mm_struct of the target address space * @addr: start address to access * @buf: source or destination buffer -- cgit v1.2.3 From 95042f9eb78a8d9a17455e2ef263f2f310ecef15 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Apr 2011 14:15:51 -0700 Subject: vm: fix mlock() on stack guard page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 53a7706d5ed8 ("mlock: do not hold mmap_sem for extended periods of time") changed mlock() to care about the exact number of pages that __get_user_pages() had brought it. Before, it would only care about errors. And that doesn't work, because we also handled one page specially in __mlock_vma_pages_range(), namely the stack guard page. So when that case was handled, the number of pages that the function returned was off by one. In particular, it could be zero, and then the caller would end up not making any progress at all. Rather than try to fix up that off-by-one error for the mlock case specially, this just moves the logic to handle the stack guard page into__get_user_pages() itself, thus making all the counts come out right automatically. Reported-by: Robert Święcki Cc: Hugh Dickins Cc: Oleg Nesterov Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/memory.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 9da8cab1b1b0..b623a249918c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1410,6 +1410,13 @@ no_page_table: return page; } +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return (vma->vm_flags & VM_GROWSDOWN) && + (vma->vm_start == addr) && + !vma_stack_continue(vma->vm_prev, addr); +} + /** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task @@ -1488,7 +1495,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vma = find_extend_vma(mm, start); if (!vma && in_gate_area(mm, start)) { unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(mm); pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -1513,10 +1519,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pte_unmap(pte); return i ? : -EFAULT; } + vma = get_gate_vma(mm); if (pages) { struct page *page; - page = vm_normal_page(gate_vma, start, *pte); + page = vm_normal_page(vma, start, *pte); if (!page) { if (!(gup_flags & FOLL_DUMP) && is_zero_pfn(pte_pfn(*pte))) @@ -1530,12 +1537,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, get_page(page); } pte_unmap(pte); - if (vmas) - vmas[i] = gate_vma; - i++; - start += PAGE_SIZE; - nr_pages--; - continue; + goto next_page; } if (!vma || @@ -1549,6 +1551,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } + /* + * If we don't actually want the page itself, + * and it's the stack guard page, just skip it. + */ + if (!pages && stack_guard_page(vma, start)) + goto next_page; + do { struct page *page; unsigned int foll_flags = gup_flags; @@ -1631,6 +1640,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, flush_anon_page(vma, page, start); flush_dcache_page(page); } +next_page: if (vmas) vmas[i] = vma; i++; -- cgit v1.2.3 From fe936dfc23fed3475b11067e8d9b70553eafcd9e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 14 Apr 2011 15:22:10 -0700 Subject: mm: check that we have the right vma in __access_remote_vm() In __access_remote_vm() we need to check that we have found the right vma, not the following vma before we try to access it. Otherwise we might call the vma's access routine with an address which does not fall inside the vma. It was discovered on a current kernel but with an unreleased driver, from memory it was strace leading to a kernel bad access, but it obviously depends on what the access implementation does. Looking at other access implementations I only see: $ git grep -A 5 vm_operations|grep access arch/powerpc/platforms/cell/spufs/file.c- .access = spufs_mem_mmap_access, arch/x86/pci/i386.c- .access = generic_access_phys, drivers/char/mem.c- .access = generic_access_phys fs/sysfs/bin.c- .access = bin_access, The spufs one looks like it might behave badly given the wrong vma, it assumes vma->vm_file->private_data is a spu_context, and looks like it would probably blow up pretty quickly if it wasn't. generic_access_phys() only uses the vma to check vm_flags and get the mm, and then walks page tables using the address. So it should bail on the vm_flags check, or at worst let you access some other VM_IO mapping. And bin_access() just proxies to another access implementation. Signed-off-by: Michael Ellerman Reviewed-by: KOSAKI Motohiro Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index b623a249918c..ce22a250926f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3688,7 +3688,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, */ #ifdef CONFIG_HAVE_IOREMAP_PROT vma = find_vma(mm, addr); - if (!vma) + if (!vma || vma->vm_start > addr) break; if (vma->vm_ops && vma->vm_ops->access) ret = vma->vm_ops->access(vma, addr, buf, -- cgit v1.2.3