From 033193275b3ffcfe7f3fde7b569f3d207f6cd6a0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Mar 2011 16:32:56 -0700 Subject: pagewalk: only split huge pages when necessary Right now, if a mm_walk has either ->pte_entry or ->pmd_entry set, it will unconditionally split any transparent huge pages it runs in to. In practice, that means that anyone doing a cat /proc/$pid/smaps will unconditionally break down every huge page in the process and depend on khugepaged to re-collapse it later. This is fairly suboptimal. This patch changes that behavior. It teaches each ->pmd_entry handler (there are five) that they must break down the THPs themselves. Also, the _generic_ code will never break down a THP unless a ->pte_entry handler is actually set. This means that the ->pmd_entry handlers can now choose to deal with THPs without breaking them down. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Dave Hansen Acked-by: Mel Gorman Acked-by: David Rientjes Reviewed-by: Eric B Munson Tested-by: Eric B Munson Cc: Michael J Wolf Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Matt Mackall Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 60b914860f81..78fd3621f565 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -343,6 +343,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct page *page; int mapcount; + split_huge_page_pmd(walk->mm, pmd); + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; @@ -467,6 +469,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; + split_huge_page_pmd(walk->mm, pmd); + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; @@ -623,6 +627,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; int err = 0; + split_huge_page_pmd(walk->mm, pmd); + /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); for (; addr != end; addr += PAGE_SIZE) { -- cgit v1.2.3 From ae11c4d9f646064cf086e2f8cd4b3c475df7739c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Mar 2011 16:32:58 -0700 Subject: smaps: break out smaps_pte_entry() from smaps_pte_range() We will use smaps_pte_entry() in a moment to handle both small and transparent large pages. But, we must break it out of smaps_pte_range() first. Signed-off-by: Dave Hansen Acked-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: David Rientjes Reviewed-by: Eric B Munson Tested-by: Eric B Munson Cc: Michael J Wolf Cc: Andrea Arcangeli Cc: Matt Mackall Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 87 +++++++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 40 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 78fd3621f565..5cd06fa3106b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -333,56 +333,63 @@ struct mem_size_stats { u64 pss; }; -static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct mm_walk *walk) + +static void smaps_pte_entry(pte_t ptent, unsigned long addr, + struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; - pte_t *pte, ptent; - spinlock_t *ptl; struct page *page; int mapcount; - split_huge_page_pmd(walk->mm, pmd); - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + if (is_swap_pte(ptent)) { + mss->swap += PAGE_SIZE; + return; + } - if (is_swap_pte(ptent)) { - mss->swap += PAGE_SIZE; - continue; - } + if (!pte_present(ptent)) + return; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + return; + + if (PageAnon(page)) + mss->anonymous += PAGE_SIZE; + + mss->resident += PAGE_SIZE; + /* Accumulate the size in pages that have been accessed. */ + if (pte_young(ptent) || PageReferenced(page)) + mss->referenced += PAGE_SIZE; + mapcount = page_mapcount(page); + if (mapcount >= 2) { + if (pte_dirty(ptent) || PageDirty(page)) + mss->shared_dirty += PAGE_SIZE; + else + mss->shared_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; + } else { + if (pte_dirty(ptent) || PageDirty(page)) + mss->private_dirty += PAGE_SIZE; + else + mss->private_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT); + } +} - if (!pte_present(ptent)) - continue; +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = mss->vma; + pte_t *pte; + spinlock_t *ptl; - page = vm_normal_page(vma, addr, ptent); - if (!page) - continue; + split_huge_page_pmd(walk->mm, pmd); - if (PageAnon(page)) - mss->anonymous += PAGE_SIZE; - - mss->resident += PAGE_SIZE; - /* Accumulate the size in pages that have been accessed. */ - if (pte_young(ptent) || PageReferenced(page)) - mss->referenced += PAGE_SIZE; - mapcount = page_mapcount(page); - if (mapcount >= 2) { - if (pte_dirty(ptent) || PageDirty(page)) - mss->shared_dirty += PAGE_SIZE; - else - mss->shared_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; - } else { - if (pte_dirty(ptent) || PageDirty(page)) - mss->private_dirty += PAGE_SIZE; - else - mss->private_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT); - } - } + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + smaps_pte_entry(*pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); cond_resched(); return 0; -- cgit v1.2.3 From 3c9acc7849b1eab7ffc75e933404c5f32865d9a2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Mar 2011 16:32:59 -0700 Subject: smaps: pass pte size argument in to smaps_pte_entry() Add an argument to the new smaps_pte_entry() function to let it account in things other than PAGE_SIZE units. I changed all of the PAGE_SIZE sites, even though not all of them can be reached for transparent huge pages, just so this will continue to work without changes as THPs are improved. Signed-off-by: Dave Hansen Acked-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: David Rientjes Reviewed-by: Eric B Munson Tested-by: Eric B Munson Cc: Michael J Wolf Cc: Andrea Arcangeli Cc: Matt Mackall Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5cd06fa3106b..d7e2af334076 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -335,7 +335,7 @@ struct mem_size_stats { static void smaps_pte_entry(pte_t ptent, unsigned long addr, - struct mm_walk *walk) + unsigned long ptent_size, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; @@ -343,7 +343,7 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr, int mapcount; if (is_swap_pte(ptent)) { - mss->swap += PAGE_SIZE; + mss->swap += ptent_size; return; } @@ -355,25 +355,25 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr, return; if (PageAnon(page)) - mss->anonymous += PAGE_SIZE; + mss->anonymous += ptent_size; - mss->resident += PAGE_SIZE; + mss->resident += ptent_size; /* Accumulate the size in pages that have been accessed. */ if (pte_young(ptent) || PageReferenced(page)) - mss->referenced += PAGE_SIZE; + mss->referenced += ptent_size; mapcount = page_mapcount(page); if (mapcount >= 2) { if (pte_dirty(ptent) || PageDirty(page)) - mss->shared_dirty += PAGE_SIZE; + mss->shared_dirty += ptent_size; else - mss->shared_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; + mss->shared_clean += ptent_size; + mss->pss += (ptent_size << PSS_SHIFT) / mapcount; } else { if (pte_dirty(ptent) || PageDirty(page)) - mss->private_dirty += PAGE_SIZE; + mss->private_dirty += ptent_size; else - mss->private_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT); + mss->private_clean += ptent_size; + mss->pss += (ptent_size << PSS_SHIFT); } } @@ -389,7 +389,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) - smaps_pte_entry(*pte, addr, walk); + smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); pte_unmap_unlock(pte - 1, ptl); cond_resched(); return 0; -- cgit v1.2.3 From 22e057c5923e60debad318cbeaee33033b110bc8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Mar 2011 16:33:00 -0700 Subject: smaps: teach smaps_pte_range() about THP pmds This adds code to explicitly detect and handle pmd_trans_huge() pmds. It then passes HPAGE_SIZE units in to the smap_pte_entry() function instead of PAGE_SIZE. This means that using /proc/$pid/smaps now will no longer cause THPs to be broken down in to small pages. Signed-off-by: Dave Hansen Reviewed-by: Eric B Munson Tested-by: Eric B Munson Acked-by: Andrea Arcangeli Acked-by: David Rientjes Cc: Mel Gorman Cc: Michael J Wolf Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Matt Mackall Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d7e2af334076..26f9cc00102c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -7,6 +8,7 @@ #include #include #include +#include #include #include @@ -385,8 +387,25 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - split_huge_page_pmd(walk->mm, pmd); - + spin_lock(&walk->mm->page_table_lock); + if (pmd_trans_huge(*pmd)) { + if (pmd_trans_splitting(*pmd)) { + spin_unlock(&walk->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + smaps_pte_entry(*(pte_t *)pmd, addr, + HPAGE_PMD_SIZE, walk); + spin_unlock(&walk->mm->page_table_lock); + return 0; + } + } else { + spin_unlock(&walk->mm->page_table_lock); + } + /* + * The mmap_sem held all the way back in m_start() is what + * keeps khugepaged out of here and from collapsing things + * in here. + */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); -- cgit v1.2.3 From 4031a219d8913da40ade5a6e5b538cc61e975cc8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Mar 2011 16:33:01 -0700 Subject: smaps: have smaps show transparent huge pages Now that the mere act of _looking_ at /proc/$pid/smaps will not destroy transparent huge pages, tell how much of the VMA is actually mapped with them. This way, we can make sure that we're getting THPs where we expect to see them. Signed-off-by: Dave Hansen Acked-by: Mel Gorman Acked-by: David Rientjes Reviewed-by: Eric B Munson Tested-by: Eric B Munson Cc: Michael J Wolf Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Matt Mackall Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 26f9cc00102c..93381aae9363 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -331,6 +331,7 @@ struct mem_size_stats { unsigned long private_dirty; unsigned long referenced; unsigned long anonymous; + unsigned long anonymous_thp; unsigned long swap; u64 pss; }; @@ -396,6 +397,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); spin_unlock(&walk->mm->page_table_lock); + mss->anonymous_thp += HPAGE_PMD_SIZE; return 0; } } else { @@ -444,6 +446,7 @@ static int show_smap(struct seq_file *m, void *v) "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" + "AnonHugePages: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" "MMUPageSize: %8lu kB\n" @@ -457,6 +460,7 @@ static int show_smap(struct seq_file *m, void *v) mss.private_dirty >> 10, mss.referenced >> 10, mss.anonymous >> 10, + mss.anonymous_thp >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, vma_mmu_pagesize(vma) >> 10, -- cgit v1.2.3 From 26ec3c646e75ce7a69fda429d68fcbdcd5eacc62 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Feb 2011 21:24:05 -0500 Subject: make sessionid permissions in /proc/*/task/* match those in /proc/* Signed-off-by: Al Viro --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index d49c4b5d2c3e..b77236de6d8f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3161,7 +3161,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), - REG("sessionid", S_IRUSR, proc_sessionid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), -- cgit v1.2.3 From ca6b0bf0e086513b9ee5efc0aa5770ecb57778af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Feb 2011 22:04:37 -0500 Subject: pagemap: close races with suid execve just use mm_for_maps() Signed-off-by: Al Viro --- fs/proc/base.c | 4 ++-- fs/proc/task_mmu.c | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index b77236de6d8f..df73573e12c9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2797,7 +2797,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_smaps_operations), - REG("pagemap", S_IRUSR, proc_pagemap_operations), + REG("pagemap", S_IRUGO, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -3133,7 +3133,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_smaps_operations), - REG("pagemap", S_IRUSR, proc_pagemap_operations), + REG("pagemap", S_IRUGO, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 60b914860f81..c966413c139b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -729,7 +729,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out; ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + mm = mm_for_maps(task); + if (!mm) goto out_task; ret = -EINVAL; @@ -742,10 +743,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; - mm = get_task_mm(task); - if (!mm) - goto out_task; - pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; -- cgit v1.2.3 From ec6fd8a4355cda81cd9f06bebc048e83eb514ac7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Feb 2011 22:22:54 -0500 Subject: report errors in /proc/*/*map* sanely Signed-off-by: Al Viro --- fs/proc/base.c | 8 +++++--- fs/proc/task_mmu.c | 10 +++++----- fs/proc/task_nommu.c | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index df73573e12c9..c28281102082 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -225,15 +225,17 @@ static int check_mem_permission(struct task_struct *task) struct mm_struct *mm_for_maps(struct task_struct *task) { struct mm_struct *mm; + int err; - if (mutex_lock_killable(&task->signal->cred_guard_mutex)) - return NULL; + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); mm = get_task_mm(task); if (mm && mm != current->mm && !ptrace_may_access(task, PTRACE_MODE_READ)) { mmput(mm); - mm = NULL; + mm = ERR_PTR(-EACCES); } mutex_unlock(&task->signal->cred_guard_mutex); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c966413c139b..8fed0f88fbf7 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -119,11 +119,11 @@ static void *m_start(struct seq_file *m, loff_t *pos) priv->task = get_pid_task(priv->pid, PIDTYPE_PID); if (!priv->task) - return NULL; + return ERR_PTR(-ESRCH); mm = mm_for_maps(priv->task); - if (!mm) - return NULL; + if (!mm || IS_ERR(mm)) + return mm; down_read(&mm->mmap_sem); tail_vma = get_gate_vma(priv->task); @@ -728,9 +728,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!task) goto out; - ret = -EACCES; mm = mm_for_maps(task); - if (!mm) + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) goto out_task; ret = -EINVAL; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index b535d3e5d5f1..980de547c070 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -199,13 +199,13 @@ static void *m_start(struct seq_file *m, loff_t *pos) /* pin the task and mm whilst we play with them */ priv->task = get_pid_task(priv->pid, PIDTYPE_PID); if (!priv->task) - return NULL; + return ERR_PTR(-ESRCH); mm = mm_for_maps(priv->task); - if (!mm) { + if (!mm || IS_ERR(mm)) { put_task_struct(priv->task); priv->task = NULL; - return NULL; + return mm; } down_read(&mm->mmap_sem); -- cgit v1.2.3 From d6f64b89d7ff22ce05896ab4a93a653e8d0b123d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Feb 2011 22:26:01 -0500 Subject: close race in /proc/*/environ Switch to mm_for_maps(). Maybe we ought to make it r--r--r--, since we do checks on IO anyway... Signed-off-by: Al Viro --- fs/proc/base.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index c28281102082..fc471b8766d1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -919,20 +919,18 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!task) goto out_no_task; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - ret = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) goto out; - ret = 0; - mm = get_task_mm(task); - if (!mm) + mm = mm_for_maps(task); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) goto out_free; + ret = 0; while (count > 0) { int this_len, retval, max_len; -- cgit v1.2.3 From 2fadaef41283aad7100fa73f01998cddaca25833 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Feb 2011 22:52:11 -0500 Subject: auxv: require the target to be tracable (or yourself) same as for environ, except that we didn't do any checks to prevent access after suid execve Signed-off-by: Al Viro --- fs/proc/base.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index fc471b8766d1..e94b58b496f1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -281,9 +281,9 @@ out: static int proc_pid_auxv(struct task_struct *task, char *buffer) { - int res = 0; - struct mm_struct *mm = get_task_mm(task); - if (mm) { + struct mm_struct *mm = mm_for_maps(task); + int res = PTR_ERR(mm); + if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { nwords += 2; -- cgit v1.2.3 From 31db58b3ab432f72ea76be58b12e6ffaf627d5db Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:15 -0400 Subject: mm: arch: make get_gate_vma take an mm_struct instead of a task_struct Morally, the presence of a gate vma is more an attribute of a particular mm than a particular task. Moreover, dropping the dependency on task_struct will help make both existing and future operations on mm's more flexible and convenient. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- fs/proc/task_mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8fed0f88fbf7..e73314afc535 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -126,7 +126,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) return mm; down_read(&mm->mmap_sem); - tail_vma = get_gate_vma(priv->task); + tail_vma = get_gate_vma(priv->task->mm); priv->tail_vma = tail_vma; /* Start with last addr hint */ @@ -277,7 +277,8 @@ static int show_map(struct seq_file *m, void *v) show_map_vma(m, vma); if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; + m->version = (vma != get_gate_vma(task->mm)) + ? vma->vm_start : 0; return 0; } @@ -436,7 +437,8 @@ static int show_smap(struct seq_file *m, void *v) (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + m->version = (vma != get_gate_vma(task->mm)) + ? vma->vm_start : 0; return 0; } -- cgit v1.2.3 From 26947f8c8f9598209001cdcd31bb2162a2e54691 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:21 -0400 Subject: proc: disable mem_write after exec This change makes mem_write() observe the same constraints as mem_read(). This is particularly important for mem_write as an accidental leak of the fd across an exec could result in arbitrary modification of the target process' memory. IOW, /proc/pid/mem is implicitly close-on-exec. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- fs/proc/base.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index e94b58b496f1..9af49a3984f1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -850,6 +850,10 @@ static ssize_t mem_write(struct file * file, const char __user *buf, if (check_mem_permission(task)) goto out; + copied = -EIO; + if (file->private_data != (void *)((long)current->self_exec_id)) + goto out; + copied = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) -- cgit v1.2.3 From 18f661bcf898742212182d75f22f05b048cc04bb Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:22 -0400 Subject: proc: hold cred_guard_mutex in check_mem_permission() Avoid a potential race when task exec's and we get a new ->mm but check against the old credentials in ptrace_may_access(). Holding of the mutex is implemented by factoring out the body of the code into a helper function __check_mem_permission(). Performing this factorization now simplifies upcoming changes and minimizes churn in the diff's. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- fs/proc/base.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 9af49a3984f1..013f116b3223 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -191,10 +191,7 @@ static int proc_root_link(struct inode *inode, struct path *path) return result; } -/* - * Return zero if current may access user memory in @task, -error if not. - */ -static int check_mem_permission(struct task_struct *task) +static int __check_mem_permission(struct task_struct *task) { /* * A task can always look at itself, in case it chooses @@ -222,6 +219,27 @@ static int check_mem_permission(struct task_struct *task) return -EPERM; } +/* + * Return zero if current may access user memory in @task, -error if not. + */ +static int check_mem_permission(struct task_struct *task) +{ + int err; + + /* + * Avoid racing if task exec's as we might get a new mm but validate + * against old credentials. + */ + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return err; + + err = __check_mem_permission(task); + mutex_unlock(&task->signal->cred_guard_mutex); + + return err; +} + struct mm_struct *mm_for_maps(struct task_struct *task) { struct mm_struct *mm; -- cgit v1.2.3 From 8b0db9db19858b08c46a84540acfd35f6e6487b8 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:23 -0400 Subject: proc: make check_mem_permission() return an mm_struct on success This change allows us to take advantage of access_remote_vm(), which in turn eliminates a security issue with the mem_write() implementation. The previous implementation of mem_write() was insecure since the target task could exec a setuid-root binary between the permission check and the actual write. Holding a reference to the target mm_struct eliminates this vulnerability. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- fs/proc/base.c | 58 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 24 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 013f116b3223..e34c3c33b2de 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -191,14 +191,20 @@ static int proc_root_link(struct inode *inode, struct path *path) return result; } -static int __check_mem_permission(struct task_struct *task) +static struct mm_struct *__check_mem_permission(struct task_struct *task) { + struct mm_struct *mm; + + mm = get_task_mm(task); + if (!mm) + return ERR_PTR(-EINVAL); + /* * A task can always look at itself, in case it chooses * to use system calls instead of load instructions. */ if (task == current) - return 0; + return mm; /* * If current is actively ptrace'ing, and would also be @@ -210,20 +216,23 @@ static int __check_mem_permission(struct task_struct *task) match = (tracehook_tracer_task(task) == current); rcu_read_unlock(); if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) - return 0; + return mm; } /* * Noone else is allowed. */ - return -EPERM; + mmput(mm); + return ERR_PTR(-EPERM); } /* - * Return zero if current may access user memory in @task, -error if not. + * If current may access user memory in @task return a reference to the + * corresponding mm, otherwise ERR_PTR. */ -static int check_mem_permission(struct task_struct *task) +static struct mm_struct *check_mem_permission(struct task_struct *task) { + struct mm_struct *mm; int err; /* @@ -232,12 +241,12 @@ static int check_mem_permission(struct task_struct *task) */ err = mutex_lock_killable(&task->signal->cred_guard_mutex); if (err) - return err; + return ERR_PTR(err); - err = __check_mem_permission(task); + mm = __check_mem_permission(task); mutex_unlock(&task->signal->cred_guard_mutex); - return err; + return mm; } struct mm_struct *mm_for_maps(struct task_struct *task) @@ -795,18 +804,14 @@ static ssize_t mem_read(struct file * file, char __user * buf, if (!task) goto out_no_task; - if (check_mem_permission(task)) - goto out; - ret = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) goto out; - ret = 0; - - mm = get_task_mm(task); - if (!mm) + mm = check_mem_permission(task); + ret = PTR_ERR(mm); + if (IS_ERR(mm)) goto out_free; ret = -EIO; @@ -820,8 +825,8 @@ static ssize_t mem_read(struct file * file, char __user * buf, int this_len, retval; this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - retval = access_process_vm(task, src, page, this_len, 0); - if (!retval || check_mem_permission(task)) { + retval = access_remote_vm(mm, src, page, this_len, 0); + if (!retval) { if (!ret) ret = -EIO; break; @@ -860,22 +865,25 @@ static ssize_t mem_write(struct file * file, const char __user *buf, char *page; struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); unsigned long dst = *ppos; + struct mm_struct *mm; copied = -ESRCH; if (!task) goto out_no_task; - if (check_mem_permission(task)) - goto out; + mm = check_mem_permission(task); + copied = PTR_ERR(mm); + if (IS_ERR(mm)) + goto out_task; copied = -EIO; if (file->private_data != (void *)((long)current->self_exec_id)) - goto out; + goto out_mm; copied = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - goto out; + goto out_mm; copied = 0; while (count > 0) { @@ -886,7 +894,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf, copied = -EFAULT; break; } - retval = access_process_vm(task, dst, page, this_len, 1); + retval = access_remote_vm(mm, dst, page, this_len, 1); if (!retval) { if (!copied) copied = -EIO; @@ -899,7 +907,9 @@ static ssize_t mem_write(struct file * file, const char __user *buf, } *ppos = dst; free_page((unsigned long) page); -out: +out_mm: + mmput(mm); +out_task: put_task_struct(task); out_no_task: return copied; -- cgit v1.2.3 From 198214a7ee50375fa71a65e518341980cfd4b2f0 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:24 -0400 Subject: proc: enable writing to /proc/pid/mem With recent changes there is no longer a security hazard with writing to /proc/pid/mem. Remove the #ifdef. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- fs/proc/base.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index e34c3c33b2de..bc15df390ec4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -854,10 +854,6 @@ out_no_task: return ret; } -#define mem_write NULL - -#ifndef mem_write -/* This is a security hazard */ static ssize_t mem_write(struct file * file, const char __user *buf, size_t count, loff_t *ppos) { @@ -914,7 +910,6 @@ out_task: out_no_task: return copied; } -#endif loff_t mem_lseek(struct file *file, loff_t offset, int orig) { -- cgit v1.2.3 From a9712bc12c40c172e393f85a9b2ba8db4bf59509 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Mar 2011 15:52:50 -0400 Subject: deal with races in /proc/*/{syscall,stack,personality} All of those are rw-r--r-- and all are broken for suid - if you open a file before the target does suid-root exec, you'll be still able to access it. For personality it's not a big deal, but for syscall and stack it's a real problem. Fix: check that task is tracable for you at the time of read(). Signed-off-by: Al Viro --- fs/proc/base.c | 69 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 19 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index bc15df390ec4..7d5bb8b9a4ff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -347,6 +347,23 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) } #endif /* CONFIG_KALLSYMS */ +static int lock_trace(struct task_struct *task) +{ + int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return err; + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + mutex_unlock(&task->signal->cred_guard_mutex); + return -EPERM; + } + return 0; +} + +static void unlock_trace(struct task_struct *task) +{ + mutex_unlock(&task->signal->cred_guard_mutex); +} + #ifdef CONFIG_STACKTRACE #define MAX_STACK_TRACE_DEPTH 64 @@ -356,6 +373,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, { struct stack_trace trace; unsigned long *entries; + int err; int i; entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); @@ -366,15 +384,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, trace.max_entries = MAX_STACK_TRACE_DEPTH; trace.entries = entries; trace.skip = 0; - save_stack_trace_tsk(task, &trace); - for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%p>] %pS\n", - (void *)entries[i], (void *)entries[i]); + err = lock_trace(task); + if (!err) { + save_stack_trace_tsk(task, &trace); + + for (i = 0; i < trace.nr_entries; i++) { + seq_printf(m, "[<%p>] %pS\n", + (void *)entries[i], (void *)entries[i]); + } + unlock_trace(task); } kfree(entries); - return 0; + return err; } #endif @@ -537,18 +560,22 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer) { long nr; unsigned long args[6], sp, pc; + int res = lock_trace(task); + if (res) + return res; if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) - return sprintf(buffer, "running\n"); - - if (nr < 0) - return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); - - return sprintf(buffer, + res = sprintf(buffer, "running\n"); + else if (nr < 0) + res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + else + res = sprintf(buffer, "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", nr, args[0], args[1], args[2], args[3], args[4], args[5], sp, pc); + unlock_trace(task); + return res; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ @@ -2775,8 +2802,12 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - seq_printf(m, "%08x\n", task->personality); - return 0; + int err = lock_trace(task); + if (!err) { + seq_printf(m, "%08x\n", task->personality); + unlock_trace(task); + } + return err; } /* @@ -2795,7 +2826,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUSR, proc_pid_personality), + ONE("personality", S_IRUGO, proc_pid_personality), INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), @@ -2805,7 +2836,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, proc_pid_syscall), + INF("syscall", S_IRUGO, proc_pid_syscall), #endif INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), @@ -2833,7 +2864,7 @@ static const struct pid_entry tgid_base_stuff[] = { INF("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE - ONE("stack", S_IRUSR, proc_pid_stack), + ONE("stack", S_IRUGO, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, proc_pid_schedstat), @@ -3135,14 +3166,14 @@ static const struct pid_entry tid_base_stuff[] = { REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUSR, proc_pid_personality), + ONE("personality", S_IRUGO, proc_pid_personality), INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, proc_pid_syscall), + INF("syscall", S_IRUGO, proc_pid_syscall), #endif INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), @@ -3169,7 +3200,7 @@ static const struct pid_entry tid_base_stuff[] = { INF("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE - ONE("stack", S_IRUSR, proc_pid_stack), + ONE("stack", S_IRUGO, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, proc_pid_schedstat), -- cgit v1.2.3 From 51e031496d50f87ff519a63cfd4fc2f415f03336 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 23 Mar 2011 16:42:48 -0700 Subject: proc: hide kernel addresses via %pK in /proc//stack This file is readable for the task owner. Hide kernel addresses from unprivileged users, leave them function names and offsets. Signed-off-by: Konstantin Khlebnikov Acked-by: Kees Cook Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index d49c4b5d2c3e..c3af15e9c070 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -340,7 +340,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%p>] %pS\n", + seq_printf(m, "[<%pK>] %pS\n", (void *)entries[i], (void *)entries[i]); } kfree(entries); -- cgit v1.2.3 From 0db0c01b53a1a421513f91573241aabafb87802a Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Wed, 23 Mar 2011 16:42:50 -0700 Subject: procfs: fix /proc//maps heap check The current code fails to print the "[heap]" marking if the heap is split into multiple mappings. Fix the check so that the marking is displayed in all possible cases: 1. vma matches exactly the heap 2. the heap vma is merged e.g. with bss 3. the heap vma is splitted e.g. due to locked pages Test cases. In all cases, the process should have mapping(s) with [heap] marking: (1) vma matches exactly the heap #include #include #include int main (void) { if (sbrk(4096) != (void *)-1) { printf("check /proc/%d/maps\n", (int)getpid()); while (1) sleep(1); } return 0; } # ./test1 check /proc/553/maps [1] + Stopped ./test1 # cat /proc/553/maps | head -4 00008000-00009000 r-xp 00000000 01:00 3113640 /test1 00010000-00011000 rw-p 00000000 01:00 3113640 /test1 00011000-00012000 rw-p 00000000 00:00 0 [heap] 4006f000-40070000 rw-p 00000000 00:00 0 (2) the heap vma is merged #include #include #include char foo[4096] = "foo"; char bar[4096]; int main (void) { if (sbrk(4096) != (void *)-1) { printf("check /proc/%d/maps\n", (int)getpid()); while (1) sleep(1); } return 0; } # ./test2 check /proc/556/maps [2] + Stopped ./test2 # cat /proc/556/maps | head -4 00008000-00009000 r-xp 00000000 01:00 3116312 /test2 00010000-00012000 rw-p 00000000 01:00 3116312 /test2 00012000-00014000 rw-p 00000000 00:00 0 [heap] 4004a000-4004b000 rw-p 00000000 00:00 0 (3) the heap vma is splitted (this fails without the patch) #include #include #include #include int main (void) { if ((sbrk(4096) != (void *)-1) && !mlockall(MCL_FUTURE) && (sbrk(4096) != (void *)-1)) { printf("check /proc/%d/maps\n", (int)getpid()); while (1) sleep(1); } return 0; } # ./test3 check /proc/559/maps [1] + Stopped ./test3 # cat /proc/559/maps|head -4 00008000-00009000 r-xp 00000000 01:00 3119108 /test3 00010000-00011000 rw-p 00000000 01:00 3119108 /test3 00011000-00012000 rw-p 00000000 00:00 0 [heap] 00012000-00013000 rw-p 00000000 00:00 0 [heap] It looks like the bug has been there forever, and since it only results in some information missing from a procfile, it does not fulfil the -stable "critical issue" criteria. Signed-off-by: Aaro Koskinen Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 93381aae9363..636f1a1fdf87 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -251,8 +251,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) const char *name = arch_vma_name(vma); if (!name) { if (mm) { - if (vma->vm_start <= mm->start_brk && - vma->vm_end >= mm->brk) { + if (vma->vm_start <= mm->brk && + vma->vm_end >= mm->start_brk) { name = "[heap]"; } else if (vma->vm_start <= mm->start_stack && vma->vm_end >= mm->start_stack) { -- cgit v1.2.3 From fc3d8767b2b6de955579852d7a150f1734265eaf Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Wed, 23 Mar 2011 16:42:51 -0700 Subject: procfs: fix some wrong error code usage [root@wei 1]# cat /proc/1/mem cat: /proc/1/mem: No such process error code -ESRCH is wrong in this situation. Return -EPERM instead. Signed-off-by: Jovi Zhang Reviewed-by: KOSAKI Motohiro Cc: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index c3af15e9c070..daba13653256 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -775,7 +775,8 @@ static ssize_t mem_read(struct file * file, char __user * buf, if (!task) goto out_no_task; - if (check_mem_permission(task)) + ret = check_mem_permission(task); + if (ret) goto out; ret = -ENOMEM; @@ -845,7 +846,8 @@ static ssize_t mem_write(struct file * file, const char __user *buf, if (!task) goto out_no_task; - if (check_mem_permission(task)) + copied = check_mem_permission(task); + if (copied) goto out; copied = -ENOMEM; @@ -917,6 +919,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!task) goto out_no_task; + ret = -EPERM; if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out; -- cgit v1.2.3 From 312ec7e50c4d3f40b3762af651d1aa79a67f556a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 23 Mar 2011 16:42:52 -0700 Subject: proc: make struct proc_dir_entry::namelen unsigned int 1. namelen is declared "unsigned short" which hints for "maybe space savings". Indeed in 2.4 struct proc_dir_entry looked like: struct proc_dir_entry { unsigned short low_ino; unsigned short namelen; Now, low_ino is "unsigned int", all savings were gone for a long time. "struct proc_dir_entry" is not that countless to worry about it's size, anyway. 2. converting from unsigned short to int/unsigned int can only create problems, we better play it safe. Space is not really conserved, because of natural alignment for the next field. sizeof(struct proc_dir_entry) remains the same. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 01e07f2a188f..f1281339b6fa 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -28,7 +28,7 @@ DEFINE_SPINLOCK(proc_subdir_lock); -static int proc_match(int len, const char *name, struct proc_dir_entry *de) +static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { if (de->namelen != len) return 0; @@ -303,7 +303,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, { const char *cp = name, *next; struct proc_dir_entry *de; - int len; + unsigned int len; de = *ret; if (!de) @@ -602,7 +602,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, { struct proc_dir_entry *ent = NULL; const char *fn = name; - int len; + unsigned int len; /* make sure name is valid */ if (!name || !strlen(name)) goto out; @@ -786,7 +786,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) struct proc_dir_entry **p; struct proc_dir_entry *de = NULL; const char *fn = name; - int len; + unsigned int len; spin_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { -- cgit v1.2.3 From 5883f57ca0008ffc93e09cbb9847a1928e50c6f3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 23 Mar 2011 16:42:53 -0700 Subject: proc: protect mm start_code/end_code in /proc/pid/stat While mm->start_stack was protected from cross-uid viewing (commit f83ce3e6b02d5 ("proc: avoid information leaks to non-privileged processes")), the start_code and end_code values were not. This would allow the text location of a PIE binary to leak, defeating ASLR. Note that the value "1" is used instead of "0" for a protected value since "ps", "killall", and likely other readers of /proc/pid/stat, take start_code of "0" to mean a kernel thread and will misbehave. Thanks to Brad Spengler for pointing this out. Addresses CVE-2011-0726 Signed-off-by: Kees Cook Cc: Cc: Alexey Dobriyan Cc: David Howells Cc: Eugene Teo Cc: Martin Schwidefsky Cc: Brad Spengler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 7c99c1cf7e5c..5e4f776b0917 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -489,8 +489,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, vsize, mm ? get_mm_rss(mm) : 0, rsslim, - mm ? mm->start_code : 0, - mm ? mm->end_code : 0, + mm ? (permitted ? mm->start_code : 1) : 0, + mm ? (permitted ? mm->end_code : 1) : 0, (permitted && mm) ? mm->start_stack : 0, esp, eip, -- cgit v1.2.3 From 4308eebbeb2026827d4492ce8c23d99f7f144a82 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Mar 2011 16:43:13 -0700 Subject: pidns: call pid_ns_prepare_proc() from create_pid_namespace() Reorganize proc_get_sb() so it can be called before the struct pid of the first process is allocated. Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Cc: Oleg Nesterov Cc: Alexey Dobriyan Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/root.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/root.c b/fs/proc/root.c index ef9fa8e24ad6..e5e2bfa7a03b 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -43,17 +43,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, struct pid_namespace *ns; struct proc_inode *ei; - if (proc_mnt) { - /* Seed the root directory with a pid so it doesn't need - * to be special in base.c. I would do this earlier but - * the only task alive when /proc is mounted the first time - * is the init_task and it doesn't have any pids. - */ - ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode); - if (!ei->pid) - ei->pid = find_get_pid(1); - } - if (flags & MS_KERNMOUNT) ns = (struct pid_namespace *)data; else @@ -71,16 +60,16 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, return ERR_PTR(err); } - ei = PROC_I(sb->s_root->d_inode); - if (!ei->pid) { - rcu_read_lock(); - ei->pid = get_pid(find_pid_ns(1, ns)); - rcu_read_unlock(); - } - sb->s_flags |= MS_ACTIVE; } + ei = PROC_I(sb->s_root->d_inode); + if (!ei->pid) { + rcu_read_lock(); + ei->pid = get_pid(find_pid_ns(1, ns)); + rcu_read_unlock(); + } + return dget(sb->s_root); } -- cgit v1.2.3 From 52e9fc76d0d4b1e8adeee736172c6c23180059b2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Mar 2011 16:43:14 -0700 Subject: procfs: kill the global proc_mnt variable After the previous cleanup in proc_get_sb() the global proc_mnt has no reasons to exists, kill it. Signed-off-by: Oleg Nesterov Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Cc: Alexey Dobriyan Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 -- fs/proc/internal.h | 1 - fs/proc/root.c | 7 ++++--- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d6a7ca1fdac5..d15aa1b1cc8f 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -46,8 +46,6 @@ static void proc_evict_inode(struct inode *inode) } } -struct vfsmount *proc_mnt; - static struct kmem_cache * proc_inode_cachep; static struct inode *proc_alloc_inode(struct super_block *sb) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9ad561ded409..c03e8d3a3a5b 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -107,7 +107,6 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) } void pde_put(struct proc_dir_entry *pde); -extern struct vfsmount *proc_mnt; int proc_fill_super(struct super_block *); struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); diff --git a/fs/proc/root.c b/fs/proc/root.c index e5e2bfa7a03b..a9000e9cfee5 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -90,19 +90,20 @@ static struct file_system_type proc_fs_type = { void __init proc_root_init(void) { + struct vfsmount *mnt; int err; proc_init_inodecache(); err = register_filesystem(&proc_fs_type); if (err) return; - proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); - if (IS_ERR(proc_mnt)) { + mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); + if (IS_ERR(mnt)) { unregister_filesystem(&proc_fs_type); return; } - init_pid_ns.proc_mnt = proc_mnt; + init_pid_ns.proc_mnt = mnt; proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); -- cgit v1.2.3 From 76597cd31470fa130784c78fadb4dab2e624a723 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Mar 2011 19:09:29 -0700 Subject: proc: fix oops on invalid /proc//maps access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When m_start returns an error, the seq_file logic will still call m_stop with that error entry, so we'd better make sure that we check it before using it as a vma. Introduced by commit ec6fd8a4355c ("report errors in /proc/*/*map* sanely"), which replaced NULL with various ERR_PTR() cases. (On ia64, you happen to get a unaligned fault instead of a page fault, since the address used is generally some random error code like -EPERM) Reported-by: Anca Emanuel Reported-by: Tony Luck Cc: Al Viro Cc: Américo Wang Cc: Stephen Wilson Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7c708a418acc..2e7addfd9803 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -182,7 +182,8 @@ static void m_stop(struct seq_file *m, void *v) struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - vma_stop(priv, vma); + if (!IS_ERR(vma)) + vma_stop(priv, vma); if (priv->task) put_task_struct(priv->task); } -- cgit v1.2.3 From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 30 Mar 2011 22:57:33 -0300 Subject: Fix common misspellings Fixes generated by 'codespell' and manually reviewed. Signed-off-by: Lucas De Marchi --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 5a670c11aeac..dd6628d3ba42 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -220,7 +220,7 @@ static struct mm_struct *__check_mem_permission(struct task_struct *task) } /* - * Noone else is allowed. + * No one else is allowed. */ mmput(mm); return ERR_PTR(-EPERM); -- cgit v1.2.3 From d8bdc59f215e62098bc5b4256fd9928bf27053a1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 18 Apr 2011 10:36:54 -0700 Subject: proc: do proper range check on readdir offset Rather than pass in some random truncated offset to the pid-related functions, check that the offset is in range up-front. This is just cleanup, the previous commit fixed the real problem. Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/proc/base.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index dd6628d3ba42..dfa532730e55 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3124,11 +3124,16 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode); + unsigned int nr; + struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; + if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) + goto out_no_task; + nr = filp->f_pos - FIRST_PROCESS_ENTRY; + + reaper = get_proc_task(filp->f_path.dentry->d_inode); if (!reaper) goto out_no_task; -- cgit v1.2.3 From a09a79f66874c905af35d5bb5e5f2fdc7b6b894d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 9 May 2011 13:01:09 +0200 Subject: Don't lock guardpage if the stack is growing up Linux kernel excludes guard page when performing mlock on a VMA with down-growing stack. However, some architectures have up-growing stack and locking the guard page should be excluded in this case too. This patch fixes lvm2 on PA-RISC (and possibly other architectures with up-growing stack). lvm2 calculates number of used pages when locking and when unlocking and reports an internal error if the numbers mismatch. [ Patch changed fairly extensively to also fix /proc//maps for the grows-up case, and to move things around a bit to clean it all up and share the infrstructure with the /proc bits. Tested on ia64 that has both grow-up and grow-down segments - Linus ] Signed-off-by: Mikulas Patocka Tested-by: Tony Luck Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2e7addfd9803..318d8654989b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -214,7 +214,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) int flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; - unsigned long start; + unsigned long start, end; dev_t dev = 0; int len; @@ -227,13 +227,15 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; - if (vma->vm_flags & VM_GROWSDOWN) - if (!vma_stack_continue(vma->vm_prev, vma->vm_start)) - start += PAGE_SIZE; + if (stack_guard_page_start(vma, start)) + start += PAGE_SIZE; + end = vma->vm_end; + if (stack_guard_page_end(vma, end)) + end -= PAGE_SIZE; seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", start, - vma->vm_end, + end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', -- cgit v1.2.3