From d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:39 -0800 Subject: mm: clean up mm_counter Presently, per-mm statistics counter is defined by macro in sched.h This patch modifies it to - defined in mm.h as inlinf functions - use array instead of macro's name creation. This patch is for reducing patch size in future patch to modify implementation of per-mm counter. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 56 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 72fb5f39bccc..c57678478801 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -121,6 +121,7 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but @@ -376,12 +377,18 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) return 0; } -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +static inline void init_rss_vec(int *rss) { - if (file_rss) - add_mm_counter(mm, file_rss, file_rss); - if (anon_rss) - add_mm_counter(mm, anon_rss, anon_rss); + memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); +} + +static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) + if (rss[i]) + add_mm_counter(mm, i, rss[i]); } /* @@ -632,7 +639,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (page) { get_page(page); page_dup_rmap(page); - rss[PageAnon(page)]++; + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; } out_set_pte: @@ -648,11 +658,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; again: - rss[1] = rss[0] = 0; + init_rss_vec(rss); + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; @@ -688,7 +699,7 @@ again: arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(orig_src_pte); - add_mm_rss(dst_mm, rss[0], rss[1]); + add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); @@ -816,8 +827,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct mm_struct *mm = tlb->mm; pte_t *pte; spinlock_t *ptl; - int file_rss = 0; - int anon_rss = 0; + int rss[NR_MM_COUNTERS]; + + init_rss_vec(rss); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -863,14 +875,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) - anon_rss--; + rss[MM_ANONPAGES]--; else { if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && likely(!VM_SequentialReadHint(vma))) mark_page_accessed(page); - file_rss--; + rss[MM_FILEPAGES]--; } page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) @@ -893,7 +905,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); - add_mm_rss(mm, file_rss, anon_rss); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -1527,7 +1539,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, file_rss); + inc_mm_counter(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2163,11 +2175,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, file_rss); - inc_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2604,7 +2616,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2688,7 +2700,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); @@ -2842,10 +2854,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, file_rss); + inc_mm_counter(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; -- cgit v1.2.3 From 34e55232e59f7b19050267a05ff1226e5cd122a5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:40 -0800 Subject: mm: avoid false sharing of mm_counter Considering the nature of per mm stats, it's the shared object among threads and can be a cache-miss point in the page fault path. This patch adds per-thread cache for mm_counter. RSS value will be counted into a struct in task_struct and synchronized with mm's one at events. Now, in this patch, the event is the number of calls to handle_mm_fault. Per-thread value is added to mm at each 64 calls. rough estimation with small benchmark on parallel thread (2threads) shows [before] 4.5 cache-miss/faults [after] 4.0 cache-miss/faults Anyway, the most contended object is mmap_sem if the number of threads grows. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 86 insertions(+), 8 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index c57678478801..a4597614f18d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -122,6 +122,79 @@ static int __init init_zero_pfn(void) core_initcall(init_zero_pfn); +#if defined(SPLIT_RSS_COUNTING) + +void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + if (task->rss_stat.count[i]) { + add_mm_counter(mm, i, task->rss_stat.count[i]); + task->rss_stat.count[i] = 0; + } + } + task->rss_stat.events = 0; +} + +static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) +{ + struct task_struct *task = current; + + if (likely(task->mm == mm)) + task->rss_stat.count[member] += val; + else + add_mm_counter(mm, member, val); +} +#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) +#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) + +/* sync counter once per 64 page faults */ +#define TASK_RSS_EVENTS_THRESH (64) +static void check_sync_rss_stat(struct task_struct *task) +{ + if (unlikely(task != current)) + return; + if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) + __sync_task_rss_stat(task, task->mm); +} + +unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + long val = 0; + + /* + * Don't use task->mm here...for avoiding to use task_get_mm().. + * The caller must guarantee task->mm is not invalid. + */ + val = atomic_long_read(&mm->rss_stat.count[member]); + /* + * counter is updated in asynchronous manner and may go to minus. + * But it's never be expected number for users. + */ + if (val < 0) + return 0; + return (unsigned long)val; +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ + __sync_task_rss_stat(task, mm); +} +#else + +#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) +#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) + +static void check_sync_rss_stat(struct task_struct *task) +{ +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ +} +#endif + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but @@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; + if (current->mm == mm) + sync_mm_rss(current, mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); @@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2175,11 +2250,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, MM_FILEPAGES); - inc_mm_counter(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); @@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; @@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGFAULT); + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); -- cgit v1.2.3 From b084d4353ff99d824d3bc5a5c2c22c70b1fba722 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:42 -0800 Subject: mm: count swap usage A frequent questions from users about memory management is what numbers of swap ents are user for processes. And this information will give some hints to oom-killer. Besides we can count the number of swapents per a process by scanning /proc//smaps, this is very slow and not good for usual process information handler which works like 'ps' or 'top'. (ps or top is now enough slow..) This patch adds a counter of swapents to mm_counter and update is at each swap events. Information is exported via /proc//status file as [kamezawa@bluextal memory]$ cat /proc/self/status Name: cat State: R (running) Tgid: 2910 Pid: 2910 PPid: 2823 TracerPid: 0 Uid: 500 500 500 500 Gid: 500 500 500 500 FDSize: 256 Groups: 500 VmPeak: 82696 kB VmSize: 82696 kB VmLck: 0 kB VmHWM: 432 kB VmRSS: 432 kB VmData: 172 kB VmStk: 84 kB VmExe: 48 kB VmLib: 1568 kB VmPTE: 40 kB VmSwap: 0 kB <=============== this. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Reviewed-by: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index a4597614f18d..77d9f840936b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -679,7 +679,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } - if (is_write_migration_entry(entry) && + if (likely(!non_swap_entry(entry))) + rss[MM_SWAPENTS]++; + else if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent @@ -974,9 +976,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_file(ptent)) { if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) print_bad_pte(vma, addr, ptent, NULL); - } else if - (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) - print_bad_pte(vma, addr, ptent, NULL); + } else { + swp_entry_t entry = pte_to_swp_entry(ptent); + + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); @@ -2692,6 +2699,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, */ inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); -- cgit v1.2.3 From 5beb49305251e5669852ed541e8e2f2f7696c53e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 5 Mar 2010 13:42:07 -0800 Subject: mm: change anon_vma linking to fix multi-process server scalability issue The old anon_vma code can lead to scalability issues with heavily forking workloads. Specifically, each anon_vma will be shared between the parent process and all its child processes. In a workload with 1000 child processes and a VMA with 1000 anonymous pages per process that get COWed, this leads to a system with a million anonymous pages in the same anon_vma, each of which is mapped in just one of the 1000 processes. However, the current rmap code needs to walk them all, leading to O(N) scanning complexity for each page. This can result in systems where one CPU is walking the page tables of 1000 processes in page_referenced_one, while all other CPUs are stuck on the anon_vma lock. This leads to catastrophic failure for a benchmark like AIM7, where the total number of processes can reach in the tens of thousands. Real workloads are still a factor 10 less process intensive than AIM7, but they are catching up. This patch changes the way anon_vmas and VMAs are linked, which allows us to associate multiple anon_vmas with a VMA. At fork time, each child process gets its own anon_vmas, in which its COWed pages will be instantiated. The parents' anon_vma is also linked to the VMA, because non-COWed pages could be present in any of the children. This reduces rmap scanning complexity to O(1) for the pages of the 1000 child processes, with O(N) complexity for at most 1/N pages in the system. This reduces the average scanning cost in heavily forking workloads from O(N) to 2. The only real complexity in this patch stems from the fact that linking a VMA to anon_vmas now involves memory allocations. This means vma_adjust can fail, if it needs to attach a VMA to anon_vma structures. This in turn means error handling needs to be added to the calling functions. A second source of complexity is that, because there can be multiple anon_vmas, the anon_vma linking in vma_adjust can no longer be done under "the" anon_vma lock. To prevent the rmap code from walking up an incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h to make sure it is impossible to compile a kernel that needs both symbolic values for the same bitflag. Some test results: Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test box with 16GB RAM and not quite enough IO), the system ends up running >99% in system time, with every CPU on the same anon_vma lock in the pageout code. With these changes, AIM7 hits the cross-over point around 29.7k users. This happens with ~99% IO wait time, there never seems to be any spike in system time. The anon_vma lock contention appears to be resolved. [akpm@linux-foundation.org: cleanups] Signed-off-by: Rik van Riel Cc: KOSAKI Motohiro Cc: Larry Woodman Cc: Lee Schermerhorn Cc: Minchan Kim Cc: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 77d9f840936b..dc785b438d70 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -374,7 +374,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ - anon_vma_unlink(vma); + unlink_anon_vmas(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { @@ -388,7 +388,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; - anon_vma_unlink(vma); + unlink_anon_vmas(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, -- cgit v1.2.3 From c44b674323f4a2480dbeb65d4b487fa5f06f49e0 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 5 Mar 2010 13:42:09 -0800 Subject: rmap: move exclusively owned pages to own anon_vma in do_wp_page() When the parent process breaks the COW on a page, both the original which is mapped at child and the new page which is mapped parent end up in that same anon_vma. Generally this won't be a problem, but for some workloads it could preserve the O(N) rmap scanning complexity. A simple fix is to ensure that, when a page which is mapped child gets reused in do_wp_page, because we already are the exclusive owner, the page gets moved to our own exclusive child's anon_vma. Signed-off-by: Rik van Riel Cc: KOSAKI Motohiro Cc: Larry Woodman Cc: Lee Schermerhorn Reviewed-by: Minchan Kim Cc: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index dc785b438d70..d1153e37e9ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2138,6 +2138,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(old_page); } reuse = reuse_swap_page(old_page); + if (reuse) + /* + * The page is all ours. Move it to our anon_vma so + * the rmap code will not search our parent or siblings. + * Protected against the rmap code by the page lock. + */ + page_move_anon_rmap(old_page, vma, address); unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { -- cgit v1.2.3 From 53bddb4e9f3f53df02a783751984ddeade71b085 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 10 Mar 2010 15:20:38 -0800 Subject: nommu: fix build breakage Commit 34e55232e59f7b19050267a05ff1226e5cd122a5 ("mm: avoid false sharing of mm_counter") added sync_mm_rss() for syncing loosely accounted rss counters. It's for CONFIG_MMU but sync_mm_rss is called even in NOMMU enviroment (kerne/exit.c, fs/exec.c). Above commit doesn't handle it well. This patch changes SPLIT_RSS_COUNTING depends on SPLIT_PTLOCKS && CONFIG_MMU And for avoid unnecessary function calls, sync_mm_rss changed to be inlined noop function in header file. Reported-by: David Howells Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Mike Frysinger Signed-off-by: Michal Simek Signed-off-by: David Howells Cc: Greg Ungerer Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index d1153e37e9ba..3d9130bd95d0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -190,9 +190,6 @@ static void check_sync_rss_stat(struct task_struct *task) { } -void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) -{ -} #endif /* -- cgit v1.2.3 From 718a38211bf4375c0a1efad3afbc5dbaef5d33f9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 10 Mar 2010 15:20:43 -0800 Subject: mm: introduce dump_page() and print symbolic flag names - introduce dump_page() to print the page info for debugging some error condition. - convert three mm users: bad_page(), print_bad_pte() and memory offline failure. - print an extra field: the symbolic names of page->flags Example dump_page() output: [ 157.521694] page:ffffea0000a7cba8 count:2 mapcount:1 mapping:ffff88001c901791 index:0x147 [ 157.525570] page flags: 0x100000000100068(uptodate|lru|active|swapbacked) Signed-off-by: Wu Fengguang Cc: Ingo Molnar Cc: Alex Chiang Cc: Rik van Riel Cc: Andi Kleen Cc: Mel Gorman Cc: Christoph Lameter Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 3d9130bd95d0..5b7f2002e54b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -509,12 +509,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); - if (page) { - printk(KERN_ALERT - "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", - page, (void *)page->flags, page_count(page), - page_mapcount(page), page->mapping, page->index); - } + if (page) + dump_page(page); printk(KERN_ALERT "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); -- cgit v1.2.3 From 298359c5bf06c04258d7cf552426e198c47e83c1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 23 Mar 2010 13:35:37 -0700 Subject: exit: fix oops in sync_mm_rss In 2.6.34-rc1, removing vhost_net module causes an oops in sync_mm_rss (called from do_exit) when workqueue is destroyed. This does not happen on net-next, or with vhost on top of to 2.6.33. The issue seems to be introduced by 34e55232e59f7b19050267a05ff1226e5cd122a5 ("mm: avoid false sharing of mm_counter) which added sync_mm_rss() that is passed task->mm, and dereferences it without checking. If task is a kernel thread, mm might be NULL. I think this might also happen e.g. with aio. This patch fixes the oops by calling sync_mm_rss when task->mm is set to NULL. I also added BUG_ON to detect any other cases where counters get incremented while mm is NULL. The oops I observed looks like this: BUG: unable to handle kernel NULL pointer dereference at 00000000000002a8 IP: [] sync_mm_rss+0x33/0x6f PGD 0 Oops: 0002 [#1] SMP last sysfs file: /sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_map CPU 2 Modules linked in: vhost_net(-) tun bridge stp sunrpc ipv6 cpufreq_ondemand acpi_cpufreq freq_table kvm_intel kvm i5000_edac edac_core rtc_cmos bnx2 button i2c_i801 i2c_core rtc_core e1000e sg joydev ide_cd_mod serio_raw pcspkr rtc_lib cdrom virtio_net virtio_blk virtio_pci virtio_ring virtio af_packet e1000 shpchp aacraid uhci_hcd ohci_hcd ehci_hcd [last unloaded: microcode] Pid: 2046, comm: vhost Not tainted 2.6.34-rc1-vhost #25 System Planar/IBM System x3550 -[7978B3G]- RIP: 0010:[] [] sync_mm_rss+0x33/0x6f RSP: 0018:ffff8802379b7e60 EFLAGS: 00010202 RAX: 0000000000000008 RBX: ffff88023f2390c0 RCX: 0000000000000000 RDX: ffff88023f2396b0 RSI: 0000000000000000 RDI: ffff88023f2390c0 RBP: ffff8802379b7e60 R08: 0000000000000000 R09: 0000000000000000 R10: ffff88023aecfbc0 R11: 0000000000013240 R12: 0000000000000000 R13: ffffffff81051a6c R14: ffffe8ffffc0f540 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff880001e80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000000002a8 CR3: 000000023af23000 CR4: 00000000000406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process vhost (pid: 2046, threadinfo ffff8802379b6000, task ffff88023f2390c0) Stack: ffff8802379b7ee0 ffffffff81040687 ffffe8ffffc0f558 ffffffffa00a3e2d <0> 0000000000000000 ffff88023f2390c0 ffffffff81055817 ffff8802379b7e98 <0> ffff8802379b7e98 0000000100000286 ffff8802379b7ee0 ffff88023ad47d78 Call Trace: [] do_exit+0x147/0x6c4 [] ? handle_rx_net+0x0/0x17 [vhost_net] [] ? autoremove_wake_function+0x0/0x39 [] ? worker_thread+0x0/0x229 [] kthreadd+0x0/0xf2 [] kernel_thread_helper+0x4/0x10 [] ? kthread+0x0/0x87 [] ? kernel_thread_helper+0x0/0x10 Code: 00 8b 87 6c 02 00 00 85 c0 74 14 48 98 f0 48 01 86 a0 02 00 00 c7 87 6c 02 00 00 00 00 00 00 8b 87 70 02 00 00 85 c0 74 14 48 98 48 01 86 a8 02 00 00 c7 87 70 02 00 00 00 00 00 00 8b 87 74 RIP [] sync_mm_rss+0x33/0x6f RSP CR2: 00000000000002a8 ---[ end trace 41603ba922beddd2 ]--- Fixing recursive fault but reboot is needed! (note: handle_rx_net is a work item using workqueue in question). sync_mm_rss+0x33/0x6f gave me a hint. I also tried reverting 34e55232e59f7b19050267a05ff1226e5cd122a5 and the oops goes away. The module in question calls use_mm and later unuse_mm from a kernel thread. It is when this kernel thread is destroyed that the crash happens. Signed-off-by: Michael S. Tsirkin Andrea Arcangeli Reviewed-by: Rik van Riel Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 5b7f2002e54b..bc9ba5a1f5b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -130,6 +130,7 @@ void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { if (task->rss_stat.count[i]) { + BUG_ON(!mm); add_mm_counter(mm, i, task->rss_stat.count[i]); task->rss_stat.count[i] = 0; } -- cgit v1.2.3