From 2ce9738bac1b386f46e8478fd2c263460e7c2b09 Mon Sep 17 00:00:00 2001 From: "eparis@redhat" Date: Thu, 2 Jun 2011 21:20:51 +1000 Subject: cgroupfs: use init_cred when populating new cgroupfs mount We recently found that in some configurations SELinux was blocking the ability for cgroupfs to be mounted. The reason for this is because cgroupfs creates files and directories during the get_sb() call and also uses lookup_one_len() during that same get_sb() call. This is a problem since the security subsystem cannot initialize the superblock and the inodes in that filesystem until after the get_sb() call returns. Thus we leave the inodes in an unitialized state during get_sb(). For the vast majority of filesystems this is not an issue, but since cgroupfs uses lookup_on_len() it does search permission checks on the directories in the path it walks. Since the inode security state is not set up SELinux does these checks as if the inodes were 'unlabeled.' Many 'normal' userspace process do not have permission to interact with unlabeled inodes. The solution presented here is to do the permission checks of path walk and inode creation as the kernel rather than as the task that called mount. Since the kernel has permission to read/write/create unlabeled inodes the get_sb() call will complete successfully and the SELinux code will be able to initialize the superblock and those inodes created during the get_sb() call. This appears to be the same solution used by other filesystems such as devtmpfs to solve the same issue and should thus have no negative impact on other LSMs which currently work. Signed-off-by: Eric Paris Acked-by: Paul Menage Signed-off-by: James Morris --- kernel/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2731d115d725..81a867851fee 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -27,9 +27,11 @@ */ #include +#include #include #include #include +#include #include #include #include @@ -1514,6 +1516,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; struct cgroupfs_root *existing_root; + const struct cred *cred; int i; BUG_ON(sb->s_root != NULL); @@ -1593,7 +1596,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); + cred = override_creds(&init_cred); cgroup_populate_dir(root_cgrp); + revert_creds(cred); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); } else { -- cgit v1.2.3 From f629299b544b6cc12b4e3e85fec96f4ce5809482 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 24 Jul 2011 23:15:42 +0200 Subject: trace events: Update version number reference to new 3.x scheme for EVENT_POWER_TRACING_DEPRECATED What was scheduled to be 2.6.41 is now going to be 3.1 . Signed-off-by: Jesper Juhl Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1107250929370.8080@swampdragon.chaosbits.net Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2ad39e556cb4..cd3134510f3d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED power:power_frequency This is for userspace compatibility and will vanish after 5 kernel iterations, - namely 2.6.41. + namely 3.1. config CONTEXT_SWITCH_TRACER bool -- cgit v1.2.3 From 1dd75f91ae713049eb6baaa640078f3a6549e522 Mon Sep 17 00:00:00 2001 From: "jhbird.choi@samsung.com" Date: Thu, 21 Jul 2011 15:29:14 +0900 Subject: genirq: Fix wrong bit operation (!msk & 0x01) should be !(msk & 0x01) Signed-off-by: Jonghwan Choi Link: http://lkml.kernel.org/r/1311229754-6003-1-git-send-email-jhbird.choi@samsung.com Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/irq/generic-chip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 3a2cab407b93..e38544dddb18 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -246,7 +246,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); for (i = gc->irq_base; msk; msk >>= 1, i++) { - if (!msk & 0x01) + if (!(msk & 0x01)) continue; if (flags & IRQ_GC_INIT_NESTED_LOCK) @@ -301,7 +301,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, raw_spin_unlock(&gc_lock); for (; msk; msk >>= 1, i++) { - if (!msk & 0x01) + if (!(msk & 0x01)) continue; /* Remove handler first. That will mask the irq line */ -- cgit v1.2.3 From 9ea71503a8ed9184d2d0b8ccc4d269d05f7940ae Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Thu, 30 Jun 2011 11:21:32 -0500 Subject: futex: Fix regression with read only mappings commit 7485d0d3758e8e6491a5c9468114e74dc050785d (futexes: Remove rw parameter from get_futex_key()) in 2.6.33 fixed two problems: First, It prevented a loop when encountering a ZERO_PAGE. Second, it fixed RW MAP_PRIVATE futex operations by forcing the COW to occur by unconditionally performing a write access get_user_pages_fast() to get the page. The commit also introduced a user-mode regression in that it broke futex operations on read-only memory maps. For example, this breaks workloads that have one or more reader processes doing a FUTEX_WAIT on a futex within a read only shared file mapping, and a writer processes that has a writable mapping issuing the FUTEX_WAKE. This fixes the regression for valid futex operations on RO mappings by trying a RO get_user_pages_fast() when the RW get_user_pages_fast() fails. This change makes it necessary to also check for invalid use cases, such as anonymous RO mappings (which can never change) and the ZERO_PAGE which the commit referenced above was written to address. This patch does restore the original behavior with RO MAP_PRIVATE mappings, which have inherent user-mode usage problems and don't really make sense. With this patch performing a FUTEX_WAIT within a RO MAP_PRIVATE mapping will be successfully woken provided another process updates the region of the underlying mapped file. However, the mmap() man page states that for a MAP_PRIVATE mapping: It is unspecified whether changes made to the file after the mmap() call are visible in the mapped region. So user-mode users attempting to use futex operations on RO MAP_PRIVATE mappings are depending on unspecified behavior. Additionally a RO MAP_PRIVATE mapping could fail to wake up in the following case. Thread-A: call futex(FUTEX_WAIT, memory-region-A). get_futex_key() return inode based key. sleep on the key Thread-B: call mprotect(PROT_READ|PROT_WRITE, memory-region-A) Thread-B: write memory-region-A. COW happen. This process's memory-region-A become related to new COWed private (ie PageAnon=1) page. Thread-B: call futex(FUETX_WAKE, memory-region-A). get_futex_key() return mm based key. IOW, we fail to wake up Thread-A. Once again doing something like this is just silly and users who do something like this get what they deserve. While RO MAP_PRIVATE mappings are nonsensical, checking for a private mapping requires walking the vmas and was deemed too costly to avoid a userspace hang. This Patch is based on Peter Zijlstra's initial patch with modifications to only allow RO mappings for futex operations that need VERIFY_READ access. Reported-by: David Oliver Signed-off-by: Shawn Bohrer Acked-by: Peter Zijlstra Signed-off-by: Darren Hart Cc: KOSAKI Motohiro Cc: peterz@infradead.org Cc: eric.dumazet@gmail.com Cc: zvonler@rgmadvisors.com Cc: hughd@google.com Link: http://lkml.kernel.org/r/1309450892-30676-1-git-send-email-sbohrer@rgmadvisors.com Cc: stable@kernel.org Signed-off-by: Thomas Gleixner --- kernel/futex.c | 54 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index fe28dc282eae..70bb54bd2468 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -218,6 +218,8 @@ static void drop_futex_key_refs(union futex_key *key) * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, + * VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -229,12 +231,12 @@ static void drop_futex_key_refs(union futex_key *key) * lock_page() might sleep, the caller should not hold a spinlock. */ static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page, *page_head; - int err; + int err, ro = 0; /* * The futex address must be "naturally" aligned. @@ -262,8 +264,18 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) again: err = get_user_pages_fast(address, 1, 1, &page); + /* + * If write access is not required (eg. FUTEX_WAIT), try + * and get read-only access. + */ + if (err == -EFAULT && rw == VERIFY_READ) { + err = get_user_pages_fast(address, 1, 0, &page); + ro = 1; + } if (err < 0) return err; + else + err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE page_head = page; @@ -305,6 +317,13 @@ again: if (!page_head->mapping) { unlock_page(page_head); put_page(page_head); + /* + * ZERO_PAGE pages don't have a mapping. Avoid a busy loop + * trying to find one. RW mapping would have COW'd (and thus + * have a mapping) so this page is RO and won't ever change. + */ + if ((page_head == ZERO_PAGE(address))) + return -EFAULT; goto again; } @@ -316,6 +335,15 @@ again: * the object not the particular process. */ if (PageAnon(page_head)) { + /* + * A RO anonymous page will never change and thus doesn't make + * sense for futex operations. + */ + if (ro) { + err = -EFAULT; + goto out; + } + key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; @@ -327,9 +355,10 @@ again: get_futex_key_refs(key); +out: unlock_page(page_head); put_page(page_head); - return 0; + return err; } static inline void put_futex_key(union futex_key *key) @@ -940,7 +969,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -986,10 +1015,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) goto out_put_key1; @@ -1243,10 +1272,11 @@ retry: pi_state = NULL; } - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + requeue_pi ? VERIFY_WRITE : VERIFY_READ); if (unlikely(ret != 0)) goto out_put_key1; @@ -1790,7 +1820,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * while the syscall executes. */ retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); if (unlikely(ret != 0)) return ret; @@ -1941,7 +1971,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, } retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; @@ -2060,7 +2090,7 @@ retry: if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; @@ -2249,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; -- cgit v1.2.3 From 778d3b0ff0654ad7092bf823fd32010066b12365 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:30 -0700 Subject: cpusets: randomize node rotor used in cpuset_mem_spread_node() [ This patch has already been accepted as commit 0ac0c0d0f837 but later reverted (commit 35926ff5fba8) because it itroduced arch specific __node_random which was defined only for x86 code so it broke other archs. This is a followup without any arch specific code. Other than that there are no functional changes.] Some workloads that create a large number of small files tend to assign too many pages to node 0 (multi-node systems). Part of the reason is that the rotor (in cpuset_mem_spread_node()) used to assign nodes starts at node 0 for newly created tasks. This patch changes the rotor to be initialized to a random node number of the cpuset. [akpm@linux-foundation.org: fix layout] [Lee.Schermerhorn@hp.com: Define stub numa_random() for !NUMA configuration] [mhocko@suse.cz: Make it arch independent] [akpm@linux-foundation.org: fix CONFIG_NUMA=y, MAX_NUMNODES>1 build] Signed-off-by: Jack Steiner Signed-off-by: Lee Schermerhorn Signed-off-by: Michal Hocko Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Pekka Enberg Cc: Paul Menage Cc: Jack Steiner Cc: Robin Holt Cc: David Rientjes Cc: Christoph Lameter Cc: David Rientjes Cc: Jack Steiner Cc: KOSAKI Motohiro Cc: Lee Schermerhorn Cc: Michal Hocko Cc: Paul Menage Cc: Pekka Enberg Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 8 ++++++++ kernel/fork.c | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9c9b7545c810..f8bc977ccbbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2460,11 +2460,19 @@ static int cpuset_spread_node(int *rotor) int cpuset_mem_spread_node(void) { + if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) + current->cpuset_mem_spread_rotor = + node_random(¤t->mems_allowed); + return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); } int cpuset_slab_spread_node(void) { + if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) + current->cpuset_slab_spread_rotor = + node_random(¤t->mems_allowed); + return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); } diff --git a/kernel/fork.c b/kernel/fork.c index 17bf7c8d6511..e33177edb3bf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1173,6 +1173,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, } mpol_fix_fork_child_flag(p); #endif +#ifdef CONFIG_CPUSETS + p->cpuset_mem_spread_rotor = NUMA_NO_NODE; + p->cpuset_slab_spread_rotor = NUMA_NO_NODE; +#endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -- cgit v1.2.3 From fb0a685cb95a0267a96153af2f72486f27be5847 Mon Sep 17 00:00:00 2001 From: Daniel Rebelo de Oliveira Date: Tue, 26 Jul 2011 16:08:39 -0700 Subject: kernel/fork.c: fix a few coding style issues Signed-off-by: Daniel Rebelo de Oliveira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 83 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e33177edb3bf..e7ceaca89609 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -80,7 +80,7 @@ * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ -int nr_threads; /* The idle threads do not count.. */ +int nr_threads; /* The idle threads do not count.. */ int max_threads; /* tunable limit on nr_threads */ @@ -232,7 +232,7 @@ void __init fork_init(unsigned long mempages) /* * we need to allow at least 20 threads to boot a system */ - if(max_threads < 20) + if (max_threads < 20) max_threads = 20; init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; @@ -268,7 +268,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; } - err = arch_dup_task_struct(tsk, orig); + err = arch_dup_task_struct(tsk, orig); if (err) goto out; @@ -288,8 +288,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack_canary = get_random_int(); #endif - /* One for us, one for whoever does the "release_task()" (usually parent) */ - atomic_set(&tsk->usage,2); + /* + * One for us, one for whoever does the "release_task()" (usually + * parent) + */ + atomic_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif @@ -437,7 +440,7 @@ fail_nomem: goto out; } -static inline int mm_alloc_pgd(struct mm_struct * mm) +static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) @@ -445,7 +448,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm) return 0; } -static inline void mm_free_pgd(struct mm_struct * mm) +static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); } @@ -482,7 +485,7 @@ static void mm_init_aio(struct mm_struct *mm) #endif } -static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); @@ -513,9 +516,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) /* * Allocate and initialize an mm_struct. */ -struct mm_struct * mm_alloc(void) +struct mm_struct *mm_alloc(void) { - struct mm_struct * mm; + struct mm_struct *mm; mm = allocate_mm(); if (!mm) @@ -583,7 +586,7 @@ void added_exe_file_vma(struct mm_struct *mm) void removed_exe_file_vma(struct mm_struct *mm) { mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ + if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { fput(mm->exe_file); mm->exe_file = NULL; } @@ -775,9 +778,9 @@ fail_nocontext: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) +static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { - struct mm_struct * mm, *oldmm; + struct mm_struct *mm, *oldmm; int retval; tsk->min_flt = tsk->maj_flt = 0; @@ -844,7 +847,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +static int copy_files(unsigned long clone_flags, struct task_struct *tsk) { struct files_struct *oldf, *newf; int error = 0; @@ -1166,11 +1169,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); - if (IS_ERR(p->mempolicy)) { - retval = PTR_ERR(p->mempolicy); - p->mempolicy = NULL; - goto bad_fork_cleanup_cgroup; - } + if (IS_ERR(p->mempolicy)) { + retval = PTR_ERR(p->mempolicy); + p->mempolicy = NULL; + goto bad_fork_cleanup_cgroup; + } mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_CPUSETS @@ -1216,25 +1219,33 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; - - if ((retval = audit_alloc(p))) + retval = audit_alloc(p); + if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ - if ((retval = copy_semundo(clone_flags, p))) + retval = copy_semundo(clone_flags, p); + if (retval) goto bad_fork_cleanup_audit; - if ((retval = copy_files(clone_flags, p))) + retval = copy_files(clone_flags, p); + if (retval) goto bad_fork_cleanup_semundo; - if ((retval = copy_fs(clone_flags, p))) + retval = copy_fs(clone_flags, p); + if (retval) goto bad_fork_cleanup_files; - if ((retval = copy_sighand(clone_flags, p))) + retval = copy_sighand(clone_flags, p); + if (retval) goto bad_fork_cleanup_fs; - if ((retval = copy_signal(clone_flags, p))) + retval = copy_signal(clone_flags, p); + if (retval) goto bad_fork_cleanup_sighand; - if ((retval = copy_mm(clone_flags, p))) + retval = copy_mm(clone_flags, p); + if (retval) goto bad_fork_cleanup_signal; - if ((retval = copy_namespaces(clone_flags, p))) + retval = copy_namespaces(clone_flags, p); + if (retval) goto bad_fork_cleanup_mm; - if ((retval = copy_io(clone_flags, p))) + retval = copy_io(clone_flags, p); + if (retval) goto bad_fork_cleanup_namespaces; retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) @@ -1256,7 +1267,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* * Clear TID on mm_release()? */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1324,7 +1335,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). - */ + */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(¤t->sighand->siglock); @@ -1685,12 +1696,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_fs(unshare_flags, &new_fs))) + err = unshare_fs(unshare_flags, &new_fs); + if (err) goto bad_unshare_out; - if ((err = unshare_fd(unshare_flags, &new_fd))) + err = unshare_fd(unshare_flags, &new_fd); + if (err) goto bad_unshare_cleanup_fs; - if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, - new_fs))) + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + if (err) goto bad_unshare_cleanup_fd; if (new_fs || new_fd || do_sysvsem || new_nsproxy) { -- cgit v1.2.3 From b34a6b1da371ed8af1221459a18c67970f7e3d53 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 26 Jul 2011 16:08:48 -0700 Subject: ipc: introduce shm_rmid_forced sysctl Add support for the shm_rmid_forced sysctl. If set to 1, all shared memory objects in current ipc namespace will be automatically forced to use IPC_RMID. The POSIX way of handling shmem allows one to create shm objects and call shmdt(), leaving shm object associated with no process, thus consuming memory not counted via rlimits. With shm_rmid_forced=1 the shared memory object is counted at least for one process, so OOM killer may effectively kill the fat process holding the shared memory. It obviously breaks POSIX - some programs relying on the feature would stop working. So set shm_rmid_forced=1 only if you're sure nobody uses "orphaned" memory. Use shm_rmid_forced=0 by default for compatability reasons. The feature was previously impemented in -ow as a configure option. [akpm@linux-foundation.org: fix documentation, per Randy] [akpm@linux-foundation.org: fix warning] [akpm@linux-foundation.org: readability/conventionality tweaks] [akpm@linux-foundation.org: fix shm_rmid_forced/shm_forced_rmid confusion, use standard comment layout] Signed-off-by: Vasiliy Kulikov Cc: Randy Dunlap Cc: "Eric W. Biederman" Cc: "Serge E. Hallyn" Cc: Daniel Lezcano Cc: Oleg Nesterov Cc: Tejun Heo Cc: Ingo Molnar Cc: Alan Cox Cc: Solar Designer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9ee58bb9e60f..2913b3509d42 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -980,6 +980,7 @@ NORET_TYPE void do_exit(long code) trace_sched_process_exit(tsk); exit_sem(tsk); + exit_shm(tsk); exit_files(tsk); exit_fs(tsk); check_stack_usage(); -- cgit v1.2.3 From 947be5dfdaaef01b43a3d5444688ebef2bd263d4 Mon Sep 17 00:00:00 2001 From: Vitaliy Ivanov Date: Tue, 26 Jul 2011 16:08:49 -0700 Subject: gcov: disable CONSTRUCTORS for UML Selecting GCOV for UML causing configuration mismatch: warning: (GCOV_KERNEL) selects CONSTRUCTORS which has unmet direct dependencies (!UML) Constructors are not needed for UML. Signed-off-by: Vitaliy Ivanov Cc: Peter Oberparleiter Acked-by: Richard Weinberger Acked-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 5bf924d80b5c..a92028196cc1 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -3,7 +3,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - select CONSTRUCTORS + select CONSTRUCTORS if !UML default n ---help--- This option enables gcov-based code profiling (e.g. for code coverage -- cgit v1.2.3 From 4302fbc8ec2ccae279c939f241bf8ce64e1a0bb7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 26 Jul 2011 16:08:52 -0700 Subject: panic: panic=-1 for immediate reboot When a kernel BUG or oops occurs, ChromeOS intends to panic and immediately reboot, with stacktrace and other messages preserved in RAM across reboot. But the longer we delay, the more likely the user is to poweroff and lose the info. panic_timeout (seconds before rebooting) is set by panic= boot option or sysctl or /proc/sys/kernel/panic; but 0 means wait forever, so at present we have to delay at least 1 second. Let a negative number mean reboot immediately (with the small cosmetic benefit of suppressing that newline-less "Rebooting in %d seconds.." message). Signed-off-by: Hugh Dickins Signed-off-by: Mandeep Singh Baines Cc: Huang Ying Cc: Andi Kleen Cc: Hugh Dickins Cc: Olaf Hering Cc: Jesse Barnes Cc: Dave Airlie Cc: Greg Kroah-Hartman Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 69231670eb95..d7bb6974efb5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -119,6 +119,8 @@ NORET_TYPE void panic(const char * fmt, ...) } mdelay(PANIC_TIMER_STEP); } + } + if (panic_timeout != 0) { /* * This will not be a clean reboot, with everything * shutting down. But if there is a chance of -- cgit v1.2.3 From 60063497a95e716c9a689af3be2687d261f115b4 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Tue, 26 Jul 2011 16:09:06 -0700 Subject: atomic: use This allows us to move duplicated code in (atomic_inc_not_zero() for now) to Signed-off-by: Arun Sharma Reviewed-by: Eric Dumazet Cc: Ingo Molnar Cc: David Miller Cc: Eric Dumazet Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 2 +- kernel/auditsc.c | 2 +- kernel/cgroup.c | 2 +- kernel/cpuset.c | 2 +- kernel/debug/debug_core.c | 2 +- kernel/rcupdate.c | 2 +- kernel/rcutorture.c | 2 +- kernel/rcutree_trace.c | 2 +- kernel/rwsem.c | 2 +- kernel/stop_machine.c | 2 +- kernel/taskstats.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_mmiotrace.c | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 52501b5d4902..0a1355ca3d79 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -43,7 +43,7 @@ #include #include -#include +#include #include #include #include diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 00d79df03e76..ce4b054acee5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -44,7 +44,7 @@ #include #include -#include +#include #include #include #include diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a63507b92ca4..984458035d4a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -59,7 +59,7 @@ #include #include /* used in cgroup_attach_proc */ -#include +#include static DEFINE_MUTEX(cgroup_mutex); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f8bc977ccbbe..10131fdaff70 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -55,7 +55,7 @@ #include #include -#include +#include #include #include #include diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index bad6786dee88..0d7c08784efb 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -51,7 +51,7 @@ #include #include -#include +#include #include #include "debug_core.h" diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 7784bd216b6a..ddddb320be61 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index ced72102adc2..98f51b13bb7e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4e144876dc68..3b0c0986afc0 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 176e5e56ffab..9f48f3d82e9b 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -11,7 +11,7 @@ #include #include -#include +#include /* * lock for reading diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c1124752e1d3..ba5070ce5765 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -19,7 +19,7 @@ #include #include -#include +#include /* * Structure to determine completion condition and record errors. May diff --git a/kernel/taskstats.c b/kernel/taskstats.c index fc0f22005417..d1db2880d1cf 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include /* * Maximum length of a cpumask that can be specified in diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3f381d0b20a8..616846bcfee5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2,7 +2,7 @@ #define _LINUX_KERNEL_TRACE_H #include -#include +#include #include #include #include diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 017fa376505d..fd3c8aae55e5 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include "trace.h" #include "trace_output.h" -- cgit v1.2.3 From c1095c6da518b0b64e724f629051fa67655cd8d9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Jul 2011 12:49:44 -0700 Subject: signals: sys_ssetmask/sys_rt_sigsuspend should use set_current_blocked() sys_ssetmask(), sys_rt_sigsuspend() and compat_sys_rt_sigsuspend() change ->blocked directly. This is not correct, see the changelog in e6fa16ab "signal: sigprocmask() should do retarget_shared_pending()" Change them to use set_current_blocked(). Another change is that now we are doing ->saved_sigmask = ->blocked lockless, it doesn't make any sense to do this under ->siglock. Signed-off-by: Oleg Nesterov Reviewed-by: Matt Fleming Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 5 +---- kernel/signal.c | 17 +++++------------ 2 files changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 18197ae2d465..616c78197cca 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -992,11 +992,8 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat sigset_from_compat(&newset, &newset32); sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - spin_lock_irq(¤t->sighand->siglock); current->saved_sigmask = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&newset); current->state = TASK_INTERRUPTIBLE; schedule(); diff --git a/kernel/signal.c b/kernel/signal.c index d7f70aed1cc0..291c9700be75 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3102,15 +3102,11 @@ SYSCALL_DEFINE0(sgetmask) SYSCALL_DEFINE1(ssetmask, int, newmask) { - int old; - - spin_lock_irq(¤t->sighand->siglock); - old = current->blocked.sig[0]; + int old = current->blocked.sig[0]; + sigset_t newset; - siginitset(¤t->blocked, newmask & ~(sigmask(SIGKILL)| - sigmask(SIGSTOP))); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); + set_current_blocked(&newset); return old; } @@ -3167,11 +3163,8 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) return -EFAULT; sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - spin_lock_irq(¤t->sighand->siglock); current->saved_sigmask = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&newset); current->state = TASK_INTERRUPTIBLE; schedule(); -- cgit v1.2.3 From 2330fb8242c3efc281ab8a2d3e22686023699955 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 7 Jun 2011 11:43:57 -0300 Subject: [media] v4l2-compat-ioctl32: add VIDIOC_DQEVENT support Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- kernel/compat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index fc9eb093acd5..d4abc5bcc27c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -158,6 +158,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user __put_user(ts->tv_sec, &cts->tv_sec) || __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(put_compat_timespec); static long compat_nanosleep_restart(struct restart_block *restart) { -- cgit v1.2.3 From 08a543ad33fc188650801bd36eed4ffe272643e1 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 26 Jul 2011 03:19:06 -0600 Subject: irq: add irq_domain translation infrastructure This patch adds irq_domain infrastructure for translating from hardware irq numbers to linux irqs. This is particularly important for architectures adding device tree support because the current implementation (excluding PowerPC and SPARC) cannot handle translation for more than a single interrupt controller. irq_domain supports device tree translation for any number of interrupt controllers. This patch converts x86, Microblaze, ARM and MIPS to use irq_domain for device tree irq translation. x86 is untested beyond compiling it, irq_domain is enabled for MIPS and Microblaze, but the old behaviour is preserved until the core code is modified to actually register an irq_domain yet. On ARM it works and is required for much of the new ARM device tree board support. PowerPC has /not/ been converted to use this new infrastructure. It is still missing some features before it can replace the virq infrastructure already in powerpc (see documentation on irq_domain_map/unmap for details). Followup patches will add the missing pieces and migrate PowerPC to use irq_domain. SPARC has its own method of managing interrupts from the device tree and is unaffected by this change. Acked-by: Ralf Baechle Signed-off-by: Grant Likely --- kernel/irq/Kconfig | 4 ++ kernel/irq/Makefile | 1 + kernel/irq/irqdomain.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 kernel/irq/irqdomain.c (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d1d051b38e0b..5a38bf4de641 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -52,6 +52,10 @@ config IRQ_EDGE_EOI_HANDLER config GENERIC_IRQ_CHIP bool +# Generic irq_domain hw <--> linux irq number translation +config IRQ_DOMAIN + bool + # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 73290056cfb6..fff17381f0af 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -2,6 +2,7 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o +obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c new file mode 100644 index 000000000000..29c7bd42e25d --- /dev/null +++ b/kernel/irq/irqdomain.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include + +static LIST_HEAD(irq_domain_list); +static DEFINE_MUTEX(irq_domain_mutex); + +/** + * irq_domain_add() - Register an irq_domain + * @domain: ptr to initialized irq_domain structure + * + * Registers an irq_domain structure. The irq_domain must at a minimum be + * initialized with an ops structure pointer, and either a ->to_irq hook or + * a valid irq_base value. Everything else is optional. + */ +void irq_domain_add(struct irq_domain *domain) +{ + struct irq_data *d; + int hwirq; + + /* + * This assumes that the irq_domain owner has already allocated + * the irq_descs. This block will be removed when support for dynamic + * allocation of irq_descs is added to irq_domain. + */ + for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { + d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); + if (d || d->domain) { + /* things are broken; just report, don't clean up */ + WARN(1, "error: irq_desc already assigned to a domain"); + return; + } + d->domain = domain; + d->hwirq = hwirq; + } + + mutex_lock(&irq_domain_mutex); + list_add(&domain->list, &irq_domain_list); + mutex_unlock(&irq_domain_mutex); +} + +/** + * irq_domain_del() - Unregister an irq_domain + * @domain: ptr to registered irq_domain. + */ +void irq_domain_del(struct irq_domain *domain) +{ + struct irq_data *d; + int hwirq; + + mutex_lock(&irq_domain_mutex); + list_del(&domain->list); + mutex_unlock(&irq_domain_mutex); + + /* Clear the irq_domain assignments */ + for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { + d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); + d->domain = NULL; + } +} + +#if defined(CONFIG_OF_IRQ) +/** + * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec + * + * Used by the device tree interrupt mapping code to translate a device tree + * interrupt specifier to a valid linux irq number. Returns either a valid + * linux IRQ number or 0. + * + * When the caller no longer need the irq number returned by this function it + * should arrange to call irq_dispose_mapping(). + */ +unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, unsigned int intsize) +{ + struct irq_domain *domain; + unsigned long hwirq; + unsigned int irq, type; + int rc = -EINVAL; + + /* Find a domain which can translate the irq spec */ + mutex_lock(&irq_domain_mutex); + list_for_each_entry(domain, &irq_domain_list, list) { + if (!domain->ops->dt_translate) + continue; + rc = domain->ops->dt_translate(domain, controller, + intspec, intsize, &hwirq, &type); + if (rc == 0) + break; + } + mutex_unlock(&irq_domain_mutex); + + if (rc != 0) + return 0; + + irq = irq_domain_to_irq(domain, hwirq); + if (type != IRQ_TYPE_NONE) + irq_set_irq_type(irq, type); + pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", + controller->full_name, (int)hwirq, irq, type); + return irq; +} +EXPORT_SYMBOL_GPL(irq_create_of_mapping); + +/** + * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() + * @irq: linux irq number to be discarded + * + * Calling this function indicates the caller no longer needs a reference to + * the linux irq number returned by a prior call to irq_create_of_mapping(). + */ +void irq_dispose_mapping(unsigned int irq) +{ + /* + * nothing yet; will be filled when support for dynamic allocation of + * irq_descs is added to irq_domain + */ +} +EXPORT_SYMBOL_GPL(irq_dispose_mapping); +#endif /* CONFIG_OF_IRQ */ -- cgit v1.2.3 From 7e71330169d8056536b299290544980bccc6b300 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 26 Jul 2011 03:19:06 -0600 Subject: dt/irq: add irq_domain_generate_simple() helper irq_domain_generate_simple() is an easy way to generate an irq translation domain for simple irq controllers. It assumes a flat 1:1 mapping from hardware irq number to an offset of the first linux irq number assigned to the controller Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 29c7bd42e25d..d5828da3fd38 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -3,6 +3,8 @@ #include #include #include +#include +#include static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); @@ -119,4 +121,60 @@ void irq_dispose_mapping(unsigned int irq) */ } EXPORT_SYMBOL_GPL(irq_dispose_mapping); + +int irq_domain_simple_dt_translate(struct irq_domain *d, + struct device_node *controller, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (d->of_node != controller) + return -EINVAL; + if (intsize < 1) + return -EINVAL; + + *out_hwirq = intspec[0]; + *out_type = IRQ_TYPE_NONE; + if (intsize > 1) + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} + +struct irq_domain_ops irq_domain_simple_ops = { + .dt_translate = irq_domain_simple_dt_translate, +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +/** + * irq_domain_create_simple() - Set up a 'simple' translation range + */ +void irq_domain_add_simple(struct device_node *controller, int irq_base) +{ + struct irq_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) { + WARN_ON(1); + return; + } + + domain->irq_base = irq_base; + domain->of_node = of_node_get(controller); + domain->ops = &irq_domain_simple_ops; + irq_domain_add(domain); +} +EXPORT_SYMBOL_GPL(irq_domain_add_simple); + +void irq_domain_generate_simple(const struct of_device_id *match, + u64 phys_base, unsigned int irq_start) +{ + struct device_node *node; + pr_info("looking for phys_base=%llx, irq_start=%i\n", + (unsigned long long) phys_base, (int) irq_start); + node = of_find_matching_node_by_address(NULL, match, phys_base); + if (node) + irq_domain_add_simple(node, irq_start); + else + pr_info("no node found\n"); +} +EXPORT_SYMBOL_GPL(irq_domain_generate_simple); #endif /* CONFIG_OF_IRQ */ -- cgit v1.2.3 From f3637a5f2e2eb391ff5757bc83fb5de8f9726464 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 7 Jul 2011 22:32:17 +0200 Subject: irq: Always set IRQF_ONESHOT if no primary handler is specified If no primary handler is specified then a default one is assigned which always returns IRQ_WAKE_THREAD. This handler requires the IRQF_ONESHOT flag on LEVEL / EIO typed irqs because the source of interrupt is not disabled. Since it is required for those users and there is no difference for others it makes sense to add this flag unconditionally. Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/1310070737-18514-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a7840aeb0fb..3f9cd4799da7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1322,6 +1322,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; + irqflags |= IRQF_ONESHOT; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); -- cgit v1.2.3 From b6873807a7143b7d6d8b06809295e559d07d7deb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 11 Jul 2011 12:17:31 +0200 Subject: irq: Track the owner of irq descriptor Interrupt descriptors can be allocated from modules. The interrupts are used by other modules, but we have no refcount on the module which provides the interrupts and there is no way to establish one on the device level as the interrupt using module is agnostic to the fact that the interrupt is provided by a module rather than by some builtin interrupt controller. To prevent removal of the interrupt providing module, we can track the owner of the interrupt descriptor, which also provides the relevant irq chip functions in the irq descriptor. request/setup_irq() can now acquire a refcount on the owner module to prevent unloading. free_irq() drops the refcount. Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/20110711101731.GA13804@Chamillionaire.breakpoint.cc Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 36 ++++++++++++++++++++++++------------ kernel/irq/manage.c | 17 +++++++++++++---- 2 files changed, 37 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4c60a50e66b2..cb65d0360e31 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -70,7 +70,8 @@ static inline void desc_smp_init(struct irq_desc *desc, int node) { } static inline int desc_node(struct irq_desc *desc) { return 0; } #endif -static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) +static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, + struct module *owner) { int cpu; @@ -86,6 +87,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; + desc->owner = owner; for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); @@ -128,7 +130,7 @@ static void free_masks(struct irq_desc *desc) static inline void free_masks(struct irq_desc *desc) { } #endif -static struct irq_desc *alloc_desc(int irq, int node) +static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) { struct irq_desc *desc; gfp_t gfp = GFP_KERNEL; @@ -147,7 +149,7 @@ static struct irq_desc *alloc_desc(int irq, int node) raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); - desc_set_defaults(irq, desc, node); + desc_set_defaults(irq, desc, node, owner); return desc; @@ -173,13 +175,14 @@ static void free_desc(unsigned int irq) kfree(desc); } -static int alloc_descs(unsigned int start, unsigned int cnt, int node) +static int alloc_descs(unsigned int start, unsigned int cnt, int node, + struct module *owner) { struct irq_desc *desc; int i; for (i = 0; i < cnt; i++) { - desc = alloc_desc(start + i, node); + desc = alloc_desc(start + i, node, owner); if (!desc) goto err; mutex_lock(&sparse_irq_lock); @@ -227,7 +230,7 @@ int __init early_irq_init(void) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { - desc = alloc_desc(i, node); + desc = alloc_desc(i, node, NULL); set_bit(i, allocated_irqs); irq_insert_desc(i, desc); } @@ -261,7 +264,7 @@ int __init early_irq_init(void) alloc_masks(&desc[i], GFP_KERNEL, node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - desc_set_defaults(i, &desc[i], node); + desc_set_defaults(i, &desc[i], node, NULL); } return arch_early_irq_init(); } @@ -276,8 +279,16 @@ static void free_desc(unsigned int irq) dynamic_irq_cleanup(irq); } -static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) +static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, + struct module *owner) { + u32 i; + + for (i = 0; i < cnt; i++) { + struct irq_desc *desc = irq_to_desc(start + i); + + desc->owner = owner; + } return start; } @@ -337,7 +348,8 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * Returns the first irq number or error code */ int __ref -irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) +__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, + struct module *owner) { int start, ret; @@ -366,13 +378,13 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node); + return alloc_descs(start, cnt, node, owner); err: mutex_unlock(&sparse_irq_lock); return ret; } -EXPORT_SYMBOL_GPL(irq_alloc_descs); +EXPORT_SYMBOL_GPL(__irq_alloc_descs); /** * irq_reserve_irqs - mark irqs allocated @@ -440,7 +452,7 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, desc_node(desc)); + desc_set_defaults(irq, desc, desc_node(desc), NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3f9cd4799da7..2e9425889fa8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -883,6 +883,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; + if (!try_module_get(desc->owner)) + return -ENODEV; /* * Some drivers like serial.c use request_irq() heavily, * so we have to be careful not to interfere with a @@ -906,8 +908,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ nested = irq_settings_is_nested_thread(desc); if (nested) { - if (!new->thread_fn) - return -EINVAL; + if (!new->thread_fn) { + ret = -EINVAL; + goto out_mput; + } /* * Replace the primary handler which was provided from * the driver for non nested interrupt handling by the @@ -929,8 +933,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); - if (IS_ERR(t)) - return PTR_ERR(t); + if (IS_ERR(t)) { + ret = PTR_ERR(t); + goto out_mput; + } /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code @@ -1095,6 +1101,8 @@ out_thread: kthread_stop(t); put_task_struct(t); } +out_mput: + module_put(desc->owner); return ret; } @@ -1203,6 +1211,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) put_task_struct(action->thread); } + module_put(desc->owner); return action; } -- cgit v1.2.3 From 1c388919d89ca35741e9c4d3255adf87f76f0c06 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sat, 7 May 2011 20:53:16 +0200 Subject: resources: Add lookup_resource() Add a function to find an existing resource by a resource start address. This allows to implement simple allocators (with a malloc/free-alike API) on top of the resource system. Signed-off-by: Geert Uytterhoeven --- kernel/resource.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 3ff40178dce7..3b3cedc52592 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -553,6 +553,27 @@ int allocate_resource(struct resource *root, struct resource *new, EXPORT_SYMBOL(allocate_resource); +/** + * lookup_resource - find an existing resource by a resource start address + * @root: root resource descriptor + * @start: resource start address + * + * Returns a pointer to the resource if found, NULL otherwise + */ +struct resource *lookup_resource(struct resource *root, resource_size_t start) +{ + struct resource *res; + + read_lock(&resource_lock); + for (res = root->child; res; res = res->sibling) { + if (res->start == start) + break; + } + read_unlock(&resource_lock); + + return res; +} + /* * Insert a resource into the resource tree. If successful, return NULL, * otherwise return the conflicting resource (compare to __request_resource()) -- cgit v1.2.3 From 3bdb65ec95e6cccffc40102d7c003047c45da90c Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 30 Jun 2011 14:12:00 -0500 Subject: kdb: cleanup unused variables missed in the original kdb merge The BTARGS and BTSYMARG variables do not have any function in the mainline version of kdb. Reported-by: Tim Bird Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_bt.c | 5 ++--- kernel/debug/kdb/kdb_cmds | 4 ---- kernel/debug/kdb/kdb_main.c | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 2f62fe85f16a..7179eac7b41c 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv) unsigned long addr; long offset; - kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ - kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each - * proc in bta */ + /* Prompt after each proc in bta */ + kdbgetintenv("BTAPROMPT", &btaprompt); if (strcmp(argv[0], "bta") == 0) { struct task_struct *g, *p; diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds index 56c88e4db309..9834ad303ab6 100644 --- a/kernel/debug/kdb/kdb_cmds +++ b/kernel/debug/kdb/kdb_cmds @@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging" endefcmd defcmd dumpall "" "First line debugging" - set BTSYMARG 1 - set BTARGS 9 pid R -dumpcommon -bta endefcmd defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" - set BTSYMARG 1 - set BTARGS 9 pid R -dumpcommon -btc diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index be14779bcef6..b33116ec9e6d 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -145,7 +145,6 @@ static char *__env[] = { #endif "RADIX=16", "MDCOUNT=8", /* lines of md output */ - "BTARGS=9", /* 9 possible args in bt */ KDB_PLATFORM_ENV, "DTABCOUNT=30", "NOSECT=1", @@ -172,6 +171,7 @@ static char *__env[] = { (char *)0, (char *)0, (char *)0, + (char *)0, }; static const int __nenv = (sizeof(__env) / sizeof(char *)); -- cgit v1.2.3 From f679c4985bb2e7de9d39a5d40b6031361c4ad861 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 23 May 2011 13:17:41 -0500 Subject: kdb,kgdb: Implement switch and pass buffer from kdb -> gdb When switching from kdb mode to kgdb mode packets were getting lost depending on the size of the fifo queue of the serial chip. When gdb initially connects if it is in kdb mode it should entirely send any character buffer over to the gdbstub when switching connections. Previously kdb was zero'ing out the character buffer and this could lead to gdb failing to connect at all, or a lengthy pause could occur on the initial connect. Signed-off-by: Jason Wessel --- kernel/debug/gdbstub.c | 22 +++++++++++++++------- kernel/debug/kdb/kdb_debugger.c | 17 +++++++---------- kernel/debug/kdb/kdb_io.c | 10 ++++++---- kernel/debug/kdb/kdb_private.h | 1 + 4 files changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index a11db956dd62..34872482315e 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -42,6 +42,8 @@ /* Our I/O buffers. */ static char remcom_in_buffer[BUFMAX]; static char remcom_out_buffer[BUFMAX]; +static int gdbstub_use_prev_in_buf; +static int gdbstub_prev_in_buf_pos; /* Storage for the registers, in GDB format. */ static unsigned long gdb_regs[(NUMREGBYTES + @@ -58,6 +60,13 @@ static int gdbstub_read_wait(void) int ret = -1; int i; + if (unlikely(gdbstub_use_prev_in_buf)) { + if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf) + return remcom_in_buffer[gdbstub_prev_in_buf_pos++]; + else + gdbstub_use_prev_in_buf = 0; + } + /* poll any additional I/O interfaces that are defined */ while (ret < 0) for (i = 0; kdb_poll_funcs[i] != NULL; i++) { @@ -109,7 +118,6 @@ static void get_packet(char *buffer) buffer[count] = ch; count = count + 1; } - buffer[count] = 0; if (ch == '#') { xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; @@ -124,6 +132,7 @@ static void get_packet(char *buffer) if (dbg_io_ops->flush) dbg_io_ops->flush(); } + buffer[count] = 0; } while (checksum != xmitcsum); } @@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) case 'c': strcpy(remcom_in_buffer, cmd); return 0; - case '?': - gdb_cmd_status(ks); - break; - case '\0': - strcpy(remcom_out_buffer, ""); - break; + case '$': + strcpy(remcom_in_buffer, cmd); + gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); + gdbstub_prev_in_buf_pos = 0; + return 0; } dbg_io_ops->write_char('+'); put_packet(remcom_out_buffer); diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index dd0b1b7dd02c..fe422d275782 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs); int kdb_poll_idx = 1; EXPORT_SYMBOL_GPL(kdb_poll_idx); +static struct kgdb_state *kdb_ks; + int kdb_stub(struct kgdb_state *ks) { int error = 0; @@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks) kdb_dbtrap_t db_result = KDB_DB_NOBPT; int i; + kdb_ks = ks; if (KDB_STATE(REENTRY)) { reason = KDB_REASON_SWITCH; KDB_STATE_CLEAR(REENTRY); @@ -124,16 +127,6 @@ int kdb_stub(struct kgdb_state *ks) kdbnearsym_cleanup(); if (error == KDB_CMD_KGDB) { if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { - /* - * This inteface glue which allows kdb to transition in into - * the gdb stub. In order to do this the '?' or '' gdb serial - * packet response is processed here. And then control is - * passed to the gdbstub. - */ - if (KDB_STATE(DOING_KGDB)) - gdbstub_state(ks, "?"); - else - gdbstub_state(ks, ""); KDB_STATE_CLEAR(DOING_KGDB); KDB_STATE_CLEAR(DOING_KGDB2); } @@ -166,3 +159,7 @@ int kdb_stub(struct kgdb_state *ks) return kgdb_info[ks->cpu].ret_state; } +void kdb_gdb_state_pass(char *buf) +{ + gdbstub_state(kdb_ks, buf); +} diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 96fdaac46a80..bd233264b29f 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -35,8 +35,8 @@ static void kgdb_transition_check(char *buffer) { int slen = strlen(buffer); if (strncmp(buffer, "$?#3f", slen) != 0 && - strncmp(buffer, "$qSupported#37", slen) != 0 && - strncmp(buffer, "+$qSupported#37", slen) != 0) { + strncmp(buffer, "$qSupported", slen) != 0 && + strncmp(buffer, "+$qSupported", slen) != 0) { KDB_STATE_SET(KGDB_TRANS); kdb_printf("%s", buffer); } @@ -390,12 +390,14 @@ poll_again: /* Special escape to kgdb */ if (lastchar - buffer >= 5 && strcmp(lastchar - 5, "$?#3f") == 0) { + kdb_gdb_state_pass(lastchar - 5); strcpy(buffer, "kgdb"); KDB_STATE_SET(DOING_KGDB); return buffer; } - if (lastchar - buffer >= 14 && - strcmp(lastchar - 14, "$qSupported#37") == 0) { + if (lastchar - buffer >= 11 && + strcmp(lastchar - 11, "$qSupported") == 0) { + kdb_gdb_state_pass(lastchar - 11); strcpy(buffer, "kgdb"); KDB_STATE_SET(DOING_KGDB2); return buffer; diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 35d69ed1dfb5..03d332e63442 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -218,6 +218,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); extern void kdb_meminfo_proc_show(void); extern char *kdb_getstr(char *, size_t, char *); +extern void kdb_gdb_state_pass(char *buf); /* Defines for kdb_symbol_print */ #define KDB_SP_SPACEB 0x0001 /* Space before string */ -- cgit v1.2.3 From d613d828e8987a1f794378022f900b454fa95403 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 23 May 2011 13:22:54 -0500 Subject: kdb: Remove all references to DOING_KGDB2 The DOING_KGDB2 was originally a state variable for one of the two ways to automatically transition from kdb to kgdb. Purge all these variables and just use one single state for the transition. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_debugger.c | 4 +--- kernel/debug/kdb/kdb_io.c | 2 +- kernel/debug/kdb/kdb_main.c | 2 +- kernel/debug/kdb/kdb_private.h | 2 -- 4 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index fe422d275782..d9ca9aa481ec 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -126,10 +126,8 @@ int kdb_stub(struct kgdb_state *ks) KDB_STATE_CLEAR(PAGER); kdbnearsym_cleanup(); if (error == KDB_CMD_KGDB) { - if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { + if (KDB_STATE(DOING_KGDB)) KDB_STATE_CLEAR(DOING_KGDB); - KDB_STATE_CLEAR(DOING_KGDB2); - } return DBG_PASS_EVENT; } kdb_bp_install(ks->linux_regs); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index bd233264b29f..0dbcdfbb6fd0 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -399,7 +399,7 @@ poll_again: strcmp(lastchar - 11, "$qSupported") == 0) { kdb_gdb_state_pass(lastchar - 11); strcpy(buffer, "kgdb"); - KDB_STATE_SET(DOING_KGDB2); + KDB_STATE_SET(DOING_KGDB); return buffer; } } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index b33116ec9e6d..63786e71a3cd 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, } if (result == KDB_CMD_KGDB) { - if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) + if (!KDB_STATE(DOING_KGDB)) kdb_printf("Entering please attach debugger " "or use $D#44+ or $3#33\n"); break; diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 03d332e63442..e381d105b40b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -21,7 +21,6 @@ #define KDB_CMD_SS (-1003) #define KDB_CMD_SSB (-1004) #define KDB_CMD_KGDB (-1005) -#define KDB_CMD_KGDB2 (-1006) /* Internal debug flags */ #define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ @@ -146,7 +145,6 @@ extern int kdb_state; * keyboard on this cpu */ #define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ #define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ -#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */ #define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ #define KDB_STATE_ARCH 0xff000000 /* Reserved for arch * specific use */ -- cgit v1.2.3 From 37f86b469d73fc2f2a925536fb99b8f513f641b7 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 24 May 2011 10:43:06 -0500 Subject: kdb,kgdb: Allow arbitrary kgdb magic knock sequences The first packet that gdb sends when the kernel is in kdb mode seems to change with every release of gdb. Instead of continuing to add many different gdb packets, change kdb to automatically look for any thing that looks like a gdb packet. Example 1 cold start test: echo g > /proc/sysrq-trigger $D#44+ Example 2 cold start test: echo g > /proc/sysrq-trigger $3#33 The second one should re-enter kdb's shell right away and is purely a test. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_io.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 0dbcdfbb6fd0..4802eb5840e1 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN]; int kdb_trap_printk; -static void kgdb_transition_check(char *buffer) +static int kgdb_transition_check(char *buffer) { - int slen = strlen(buffer); - if (strncmp(buffer, "$?#3f", slen) != 0 && - strncmp(buffer, "$qSupported", slen) != 0 && - strncmp(buffer, "+$qSupported", slen) != 0) { + if (buffer[0] != '+' && buffer[0] != '$') { KDB_STATE_SET(KGDB_TRANS); kdb_printf("%s", buffer); + } else { + int slen = strlen(buffer); + if (slen > 3 && buffer[slen - 3] == '#') { + kdb_gdb_state_pass(buffer); + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB); + return 1; + } } + return 0; } static int kdb_read_get_key(char *buffer, size_t bufsize) @@ -251,6 +257,10 @@ poll_again: case 13: /* enter */ *lastchar++ = '\n'; *lastchar++ = '\0'; + if (!KDB_STATE(KGDB_TRANS)) { + KDB_STATE_SET(KGDB_TRANS); + kdb_printf("%s", buffer); + } kdb_printf("\n"); return buffer; case 4: /* Del */ @@ -382,10 +392,12 @@ poll_again: * printed characters if we think that * kgdb is connecting, until the check * fails */ - if (!KDB_STATE(KGDB_TRANS)) - kgdb_transition_check(buffer); - else + if (!KDB_STATE(KGDB_TRANS)) { + if (kgdb_transition_check(buffer)) + return buffer; + } else { kdb_printf("%c", key); + } } /* Special escape to kgdb */ if (lastchar - buffer >= 5 && -- cgit v1.2.3 From dfc428b656c4693a2334a8d9865b430beddb562a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Aug 2011 16:21:04 -0700 Subject: taskstats: add_del_listener() shouldn't use the wrong node 1. Commit 26c4caea9d69 "don't allow duplicate entries in listener mode" changed add_del_listener(REGISTER) so that "next_cpu:" can reuse the listener allocated for the previous cpu, this doesn't look exactly right even if minor. Change the code to kfree() in the already-registered case, this case is unlikely anyway so the extra kmalloc_node() shouldn't hurt but looke more correct and clean. 2. use the plain list_for_each_entry() instead of _safe() to scan listeners->list. 3. Remove the unneeded INIT_LIST_HEAD(&s->list), we are going to list_add(&s->list). Signed-off-by: Oleg Nesterov Reviewed-by: Vasiliy Kulikov Cc: Balbir Singh Reviewed-by: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d1db2880d1cf..a09a54936f19 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -291,30 +291,28 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; - s = NULL; if (isadd == REGISTER) { for_each_cpu(cpu, mask) { - if (!s) - s = kmalloc_node(sizeof(struct listener), - GFP_KERNEL, cpu_to_node(cpu)); + s = kmalloc_node(sizeof(struct listener), + GFP_KERNEL, cpu_to_node(cpu)); if (!s) goto cleanup; + s->pid = pid; - INIT_LIST_HEAD(&s->list); s->valid = 1; listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); - list_for_each_entry_safe(s2, tmp, &listeners->list, list) { + list_for_each_entry(s2, &listeners->list, list) { if (s2->pid == pid) - goto next_cpu; + goto exists; } list_add(&s->list, &listeners->list); s = NULL; -next_cpu: +exists: up_write(&listeners->sem); + kfree(s); /* nop if NULL */ } - kfree(s); return 0; } -- cgit v1.2.3 From a7295898a1d2e501427f557111c2b4bdfc90b1ed Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Aug 2011 16:21:05 -0700 Subject: taskstats: add_del_listener() should ignore !valid listeners When send_cpu_listeners() finds the orphaned listener it marks it as !valid and drops listeners->sem. Before it takes this sem for writing, s->pid can be reused and add_del_listener() can wrongly try to re-use this entry. Change add_del_listener() to check ->valid = T. Signed-off-by: Oleg Nesterov Reviewed-by: Vasiliy Kulikov Acked-by: Balbir Singh Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index a09a54936f19..e19ce1454ee1 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -304,7 +304,7 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); list_for_each_entry(s2, &listeners->list, list) { - if (s2->pid == pid) + if (s2->pid == pid && s2->valid) goto exists; } list_add(&s->list, &listeners->list); -- cgit v1.2.3 From 288d5abec8314ae50fe6692f324b0444acae8486 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 3 Aug 2011 22:03:29 -1000 Subject: Boot up with usermodehelper disabled The core device layer sends tons of uevent notifications for each device it finds, and if the kernel has been built with a non-empty CONFIG_UEVENT_HELPER_PATH that will make us try to execute the usermode helper binary for all these events very early in the boot. Not only won't the root filesystem even be mounted at that point, we literally won't have necessarily even initialized all the process handling data structures at that point, which causes no end of silly problems even when the usermode helper doesn't actually succeed in executing. So just use our existing infrastructure to disable the usermodehelpers to make the kernel start out with them disabled. We enable them when we've at least initialized stuff a bit. Problems related to an uninitialized init_ipc_ns.ids[IPC_SHM_IDS].rw_mutex reported by various people. Reported-by: Manuel Lauss Reported-by: Richard Weinberger Reported-by: Marc Zyngier Acked-by: Kay Sievers Cc: Andrew Morton Cc: Vasiliy Kulikov Cc: Greg KH Signed-off-by: Linus Torvalds --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 47613dfb7b28..ddc7644c1305 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -274,7 +274,7 @@ static void __call_usermodehelper(struct work_struct *work) * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). */ -static int usermodehelper_disabled; +static int usermodehelper_disabled = 1; /* Number of helpers running */ static atomic_t running_helpers = ATOMIC_INIT(0); -- cgit v1.2.3 From 7d36b26be0f3c6b86e3ab7e1539e42f3a3bc79ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jul 2011 13:13:44 +0200 Subject: lockdep: Fix trace_hardirqs_on_caller() Commit dd4e5d3ac4a ("lockdep: Fix trace_[soft,hard]irqs_[on,off]() recursion") made a bit of a mess of the various checks and error conditions. In particular it moved the check for !irqs_disabled() before the spurious enable test, resulting in some warnings. Reported-by: Arnaud Lacombe Reported-by: Dave Jones Reported-and-tested-by: Sergey Senozhatsky Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1311679697.24752.28.camel@twins Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3956f5149e25..74ca247a4d4f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2485,23 +2485,9 @@ static void __trace_hardirqs_on_caller(unsigned long ip) { struct task_struct *curr = current; - if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) - return; - - if (unlikely(curr->hardirqs_enabled)) { - /* - * Neither irq nor preemption are disabled here - * so this is racy by nature but losing one hit - * in a stat is not a big deal. - */ - __debug_atomic_inc(redundant_hardirqs_on); - return; - } /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) - return; /* * We are going to turn hardirqs on, so set the * usage bit for all held locks: @@ -2529,9 +2515,25 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + if (unlikely(current->hardirqs_enabled)) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) + return; + + if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + return; + current->lockdep_recursion = 1; __trace_hardirqs_on_caller(ip); current->lockdep_recursion = 0; -- cgit v1.2.3 From 70a0686a72c7a7e554b404ca11406ceec709d425 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jul 2011 12:09:59 +0200 Subject: lockdep: Fix up warning On Sun, 2011-07-24 at 21:06 -0400, Arnaud Lacombe wrote: > /src/linux/linux/kernel/lockdep.c: In function 'mark_held_locks': > /src/linux/linux/kernel/lockdep.c:2471:31: warning: comparison of > distinct pointer types lacks a cast The warning is harmless in this case, but the below makes it go away. Reported-by: Arnaud Lacombe Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1311588599.2617.56.camel@laptop Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 74ca247a4d4f..5903586f32a0 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2468,7 +2468,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) BUG_ON(usage_bit >= LOCK_USAGE_STATES); - if (hlock_class(hlock)->key == &__lockdep_no_validate__) + if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) continue; if (!mark_lock(curr, hlock, usage_bit)) -- cgit v1.2.3 From f59de8992aa6dc85e81aadc26b0f69e17809721d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Jul 2011 15:19:09 +0200 Subject: lockdep: Clear whole lockdep_map on initialization lockdep_init_map() only initializes parts of lockdep_map and triggers kmemcheck warning when it is copied as a whole. There isn't anything to be gained by clearing selectively. memset() the whole structure and remove loop for ->class_cache[] clearing. Addresses https://bugzilla.kernel.org/show_bug.cgi?id=35532 Signed-off-by: Tejun Heo Reported-and-tested-by: Christian Casteyde Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=35532 Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110714131909.GJ3455@htj.dyndns.org Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5903586f32a0..8c24294e477f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2874,10 +2874,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - int i; - - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) - lock->class_cache[i] = NULL; + memset(lock, 0, sizeof(*lock)); #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); -- cgit v1.2.3 From b77f0f3c1f587791aa5d9bd1b0012c9a89eb9258 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 5 Aug 2011 16:40:40 -0400 Subject: jump label: Reduce the cycle count by changing the link order In the course of testing jump labels for use with the CFS bandwidth controller, Paul Turner, discovered that using jump labels reduced the branch count and the instruction count, but did not reduce the cycle count or wall time. I noticed that having the jump_label.o included in the kernel but not used in any way still caused this increase in cycle count and wall time. Thus, I moved jump_label.o in the kernel/Makefile, thus changing the link order, and presumably moving it out of hot icache areas. This brought down the cycle count/time as expected. In addition to Paul's testing, I've tested the patch using a single 'static_branch()' in the getppid() path, and basically running tight loops of calls to getppid(). Here are my results for the branch disabled case: With jump labels turned on (CONFIG_JUMP_LABEL), branch disabled: Performance counter stats for 'bash -c /tmp/getppid;true' (50 runs): 3,969,510,217 instructions # 0.864 IPC ( +-0.000% ) 4,592,334,954 cycles ( +- 0.046% ) 751,634,470 branches ( +- 0.000% ) 1.722635797 seconds time elapsed ( +- 0.046% ) Jump labels turned off (CONFIG_JUMP_LABEL not set), branch disabled: Performance counter stats for 'bash -c /tmp/getppid;true' (50 runs): 4,009,611,846 instructions # 0.867 IPC ( +-0.000% ) 4,622,210,580 cycles ( +- 0.012% ) 771,662,904 branches ( +- 0.000% ) 1.734341454 seconds time elapsed ( +- 0.022% ) Signed-off-by: Jason Baron Cc: rth@redhat.com Cc: a.p.zijlstra@chello.nl Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20110805204040.GG2522@redhat.com Signed-off-by: Ingo Molnar Tested-by: Paul Turner --- kernel/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index d06467fc8f7c..eca595e2fd52 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ - async.o range.o jump_label.o + async.o range.o obj-y += groups.o ifdef CONFIG_FUNCTION_TRACER @@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += events/ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is -- cgit v1.2.3 From 80e0401e35410a69bfae05b454db8a7187edd6b8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Aug 2011 14:26:17 +0200 Subject: lockdep: Fix wrong assumption in match_held_lock match_held_lock() was assuming it was being called on a lock class that had already seen usage. This condition was true for bug-free code using lockdep_assert_held(), since you're in fact holding the lock when calling it. However the assumption fails the moment you assume the assertion can fail, which is the whole point of having the assertion in the first place. Anyway, now that there's more lockdep_is_held() users, notably __rcu_dereference_check(), its much easier to trigger this since we test for a number of locks and we only need to hold any one of them to be good. Reported-by: Sergey Senozhatsky Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1312547787.28695.2.camel@twins Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8c24294e477f..91d67ce3a8d5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3111,7 +3111,13 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) if (!class) class = look_up_lock_class(lock, 0); - if (DEBUG_LOCKS_WARN_ON(!class)) + /* + * If look_up_lock_class() failed to find a class, we're trying + * to test if we hold a lock that has never yet been acquired. + * Clearly if the lock hasn't been acquired _ever_, we're not + * holding it either, so report failure. + */ + if (!class) return 0; if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) -- cgit v1.2.3 From f2c0d0266cc5eb36a4aa44944b4096ec121490aa Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Mon, 8 Aug 2011 06:22:43 +0200 Subject: cap_syslog: don't use WARN_ONCE for CAP_SYS_ADMIN deprecation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syslog-ng versions before 3.3.0beta1 (2011-05-12) assume that CAP_SYS_ADMIN is sufficient to access syslog, so ever since CAP_SYSLOG was introduced (2010-11-25) they have triggered a warning. Commit ee24aebffb75 ("cap_syslog: accept CAP_SYS_ADMIN for now") improved matters a little by making syslog-ng work again, just keeping the WARN_ONCE(). But still, this is a warning that writes a stack trace we don't care about to syslog, sets a taint flag, and alarms sysadmins when nothing worse has happened than use of an old userspace with a recent kernel. Convert the WARN_ONCE to a printk_once to avoid that while continuing to give userspace developers a hint that this is an unwanted backward-compatibility feature and won't be around forever. Reported-by: Ralf Hildebrandt Reported-by: Niels Reported-by: PaweÅ‚ Sikora Signed-off-by: Jonathan Nieder Liked-by: Gergely Nagy Acked-by: Serge Hallyn Acked-by: James Morris Signed-off-by: Linus Torvalds --- kernel/printk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 37dff3429adb..836a2ae0ac31 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -318,8 +318,10 @@ static int check_syslog_permissions(int type, bool from_file) return 0; /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ if (capable(CAP_SYS_ADMIN)) { - WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n"); + printk_once(KERN_WARNING "%s (%d): " + "Attempt to access syslog with CAP_SYS_ADMIN " + "but no CAP_SYSLOG (deprecated).\n", + current->comm, task_pid_nr(current)); return 0; } return -EPERM; -- cgit v1.2.3 From 971c90bfa2f0b4fe52d6d9002178d547706f1343 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 4 Aug 2011 07:25:35 -0700 Subject: alarmtimers: Avoid possible null pointer traversal We don't check if old_setting is non null before assigning it, so correct this. CC: Thomas Gleixner CC: stable@kernel.org Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 59f369f98a04..1dee3f62a6a7 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -479,11 +479,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, if (!rtcdev) return -ENOTSUPP; - /* Save old values */ - old_setting->it_interval = - ktime_to_timespec(timr->it.alarmtimer.period); - old_setting->it_value = - ktime_to_timespec(timr->it.alarmtimer.node.expires); + if (old_setting) + alarm_timer_get(timr, old_setting); /* If the timer was already set, cancel it */ alarm_cancel(&timr->it.alarmtimer); -- cgit v1.2.3 From ea7802f630d356acaf66b3c0b28c00a945fc35dc Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 4 Aug 2011 07:51:56 -0700 Subject: alarmtimers: Memset itimerspec passed into alarm_timer_get Following common_timer_get, zero out the itimerspec passed in. CC: Thomas Gleixner CC: stable@kernel.org Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 1dee3f62a6a7..0e9263f6fd09 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -441,6 +441,8 @@ static int alarm_timer_create(struct k_itimer *new_timer) static void alarm_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { + memset(cur_setting, 0, sizeof(struct itimerspec)); + cur_setting->it_interval = ktime_to_timespec(timr->it.alarmtimer.period); cur_setting->it_value = -- cgit v1.2.3 From 6af7e471e5a7746b8024d70b4363d3dfe41d36b8 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 10 Aug 2011 10:26:09 -0700 Subject: alarmtimers: Avoid possible denial of service with high freq periodic timers Its possible to jam up the alarm timers by setting very small interval timers, which will cause the alarmtimer subsystem to spend all of its time firing and restarting timers. This can effectivly lock up a box. A deeper fix is needed, closely mimicking the hrtimer code, but for now just cap the interval to 100us to avoid userland hanging the system. CC: Thomas Gleixner CC: stable@kernel.org Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0e9263f6fd09..ea5e1a928d5b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -481,6 +481,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, if (!rtcdev) return -ENOTSUPP; + /* + * XXX HACK! Currently we can DOS a system if the interval + * period on alarmtimers is too small. Cap the interval here + * to 100us and solve this properly in a future patch! -jstultz + */ + if ((new_setting->it_interval.tv_sec == 0) && + (new_setting->it_interval.tv_nsec < 100000)) + new_setting->it_interval.tv_nsec = 100000; + if (old_setting) alarm_timer_get(timr, old_setting); -- cgit v1.2.3 From c09c47caedc9854d59378d6e34c989e51cfdd2b4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 11 Aug 2011 10:36:05 +0200 Subject: blktrace: add FLUSH/FUA support Add FLUSH/FUA support to blktrace. As FLUSH precedes WRITE and/or FUA follows WRITE, use the same 'F' flag for both cases and distinguish them by their (relative) position. The end results look like (other flags might be shown also): - WRITE: W - WRITE_FLUSH: FW - WRITE_FUA: WF - WRITE_FLUSH_FUA: FWF Note that we reuse TC_BARRIER due to lack of bit space of act_mask so that the older versions of blktrace tools will report flush requests as barriers from now on. Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Namhyung Kim Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6957aa298dfa..7c910a5593a6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -206,6 +206,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); what |= MASK_TC_BIT(rw, DISCARD); + what |= MASK_TC_BIT(rw, FLUSH); + what |= MASK_TC_BIT(rw, FUA); pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -1054,6 +1056,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) goto out; } + if (tc & BLK_TC_FLUSH) + rwbs[i++] = 'F'; + if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; else if (tc & BLK_TC_WRITE) @@ -1063,10 +1068,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) else rwbs[i++] = 'N'; + if (tc & BLK_TC_FUA) + rwbs[i++] = 'F'; if (tc & BLK_TC_AHEAD) rwbs[i++] = 'A'; - if (tc & BLK_TC_BARRIER) - rwbs[i++] = 'B'; if (tc & BLK_TC_SYNC) rwbs[i++] = 'S'; if (tc & BLK_TC_META) @@ -1132,7 +1137,7 @@ typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); static int blk_log_action_classic(struct trace_iterator *iter, const char *act) { - char rwbs[6]; + char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; @@ -1148,7 +1153,7 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act) static int blk_log_action(struct trace_iterator *iter, const char *act) { - char rwbs[6]; + char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); @@ -1561,7 +1566,7 @@ static const struct { } mask_maps[] = { { BLK_TC_READ, "read" }, { BLK_TC_WRITE, "write" }, - { BLK_TC_BARRIER, "barrier" }, + { BLK_TC_FLUSH, "flush" }, { BLK_TC_SYNC, "sync" }, { BLK_TC_QUEUE, "queue" }, { BLK_TC_REQUEUE, "requeue" }, @@ -1573,6 +1578,7 @@ static const struct { { BLK_TC_META, "meta" }, { BLK_TC_DISCARD, "discard" }, { BLK_TC_DRV_DATA, "drv_data" }, + { BLK_TC_FUA, "fua" }, }; static int blk_trace_str2mask(const char *str) @@ -1788,6 +1794,9 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) { int i = 0; + if (rw & REQ_FLUSH) + rwbs[i++] = 'F'; + if (rw & WRITE) rwbs[i++] = 'W'; else if (rw & REQ_DISCARD) @@ -1797,6 +1806,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) else rwbs[i++] = 'N'; + if (rw & REQ_FUA) + rwbs[i++] = 'F'; if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; if (rw & REQ_SYNC) -- cgit v1.2.3 From 72fa59970f8698023045ab0713d66f3f4f96945c Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Mon, 8 Aug 2011 19:02:04 +0400 Subject: move RLIMIT_NPROC check from set_user() to do_execve_common() The patch http://lkml.org/lkml/2003/7/13/226 introduced an RLIMIT_NPROC check in set_user() to check for NPROC exceeding via setuid() and similar functions. Before the check there was a possibility to greatly exceed the allowed number of processes by an unprivileged user if the program relied on rlimit only. But the check created new security threat: many poorly written programs simply don't check setuid() return code and believe it cannot fail if executed with root privileges. So, the check is removed in this patch because of too often privilege escalations related to buggy programs. The NPROC can still be enforced in the common code flow of daemons spawning user processes. Most of daemons do fork()+setuid()+execve(). The check introduced in execve() (1) enforces the same limit as in setuid() and (2) doesn't create similar security issues. Neil Brown suggested to track what specific process has exceeded the limit by setting PF_NPROC_EXCEEDED process flag. With the change only this process would fail on execve(), and other processes' execve() behaviour is not changed. Solar Designer suggested to re-check whether NPROC limit is still exceeded at the moment of execve(). If the process was sleeping for days between set*uid() and execve(), and the NPROC counter step down under the limit, the defered execve() failure because NPROC limit was exceeded days ago would be unexpected. If the limit is not exceeded anymore, we clear the flag on successful calls to execve() and fork(). The flag is also cleared on successful calls to set_user() as the limit was exceeded for the previous user, not the current one. Similar check was introduced in -ow patches (without the process flag). v3 - clear PF_NPROC_EXCEEDED on successful calls to set_user(). Reviewed-by: James Morris Signed-off-by: Vasiliy Kulikov Acked-by: NeilBrown Signed-off-by: Linus Torvalds --- kernel/cred.c | 6 ++---- kernel/fork.c | 1 + kernel/sys.c | 15 +++++++++++---- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 174fa84eca30..8ef31f53c44c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -508,10 +508,8 @@ int commit_creds(struct cred *new) key_fsgid_changed(task); /* do it - * - What if a process setreuid()'s and this brings the - * new uid over his NPROC rlimit? We can check this now - * cheaply with the new uid cache, so if it matters - * we should be checking for it. -DaveM + * RLIMIT_NPROC limits on user->processes have already been checked + * in set_user(). */ alter_cred_subscribers(new, 2); if (new->user != old->user) diff --git a/kernel/fork.c b/kernel/fork.c index e7ceaca89609..8e6b6f4fb272 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1111,6 +1111,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_cred->user != INIT_USER) goto bad_fork_free; } + current->flags &= ~PF_NPROC_EXCEEDED; retval = copy_creds(p, clone_flags); if (retval < 0) diff --git a/kernel/sys.c b/kernel/sys.c index a101ba36c444..dd948a1fca4c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -621,11 +621,18 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; + /* + * We don't fail in case of NPROC limit excess here because too many + * poorly written programs don't check set*uid() return code, assuming + * it never fails if called by root. We may still enforce NPROC limit + * for programs doing set*uid()+execve() by harmlessly deferring the + * failure to the execve() stage. + */ if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && - new_user != INIT_USER) { - free_uid(new_user); - return -EAGAIN; - } + new_user != INIT_USER) + current->flags |= PF_NPROC_EXCEEDED; + else + current->flags &= ~PF_NPROC_EXCEEDED; free_uid(new->user); new->user = new_user; -- cgit v1.2.3 From c59d87c460767bc35dafd490139d3cfe78fb8da4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 12 Aug 2011 16:21:35 -0500 Subject: xfs: remove subdirectories Use the move from Linux 2.6 to Linux 3.x as an excuse to kill the annoying subdirectories in the XFS source code. Besides the large amount of file rename the only changes are to the Makefile, a few files including headers with the subdirectory prefix, and the binary sysctl compat code that includes a header under fs/xfs/ from kernel/. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- kernel/sysctl_binary.c | 2 +- kernel/sysctl_check.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 3b8e028b9601..e8bffbe2ba4b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1,6 +1,6 @@ #include #include -#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include "../fs/xfs/xfs_sysctl.h" #include #include #include diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 4e4932a7b360..362da653813d 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1,6 +1,6 @@ #include #include -#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include "../fs/xfs/xfs_sysctl.h" #include #include #include -- cgit v1.2.3 From 17f2ae7f677f023997e02fd2ebabd90ea2a0390d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 14 Aug 2011 13:34:31 +0200 Subject: PM / Domains: Fix build for CONFIG_PM_RUNTIME unset Function genpd_queue_power_off_work() is not defined for CONFIG_PM_RUNTIME, so pm_genpd_poweroff_unused() causes a build error to happen in that case. Fix the problem by making pm_genpd_poweroff_unused() depend on CONFIG_PM_RUNTIME too. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index b1914cb9095c..3744c594b19b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -231,3 +231,7 @@ config PM_CLK config PM_GENERIC_DOMAINS bool depends on PM + +config PM_GENERIC_DOMAINS_RUNTIME + def_bool y + depends on PM_RUNTIME && PM_GENERIC_DOMAINS -- cgit v1.2.3 From d522a0d17963e9c2e556db2cbd60c96d40505b6c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 18 Aug 2011 12:19:27 -0700 Subject: irqdesc: fix new kernel-doc warning Fix kernel-doc warning in irqdesc.c: Warning(kernel/irq/irqdesc.c:353): No description found for parameter 'owner' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index cb65d0360e31..039b889ea053 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -344,6 +344,7 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated + * @owner: Owning module (can be NULL) * * Returns the first irq number or error code */ -- cgit v1.2.3 From 69dd3d8e29e294caaf63eb5e8a72d250279f9e5f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 23 Aug 2011 10:36:51 -0700 Subject: Revert "irq: Always set IRQF_ONESHOT if no primary handler is specified" This reverts commit f3637a5f2e2eb391ff5757bc83fb5de8f9726464. It turns out that this breaks several drivers, one example being OMAP boards which use the on-board OMAP UARTs and the omap-serial driver that will not boot to userspace after the commit. Paul Walmsley reports that enabling CONFIG_DEBUG_SHIRQ reveals 'IRQ handler type mismatch' errors: IRQ handler type mismatch for IRQ 74 current handler: serial idle ... and the reason is that setting IRQF_ONESHOT will now result in those interrupt handlers having different IRQF flags, and thus being unsharable. So the commit log in the reverted commit: "Since it is required for those users and there is no difference for others it makes sense to add this flag unconditionally." is simply not true: there may not be any difference from a "actions at irq time", but there is a *big* difference wrt this flag testing irq management (see __setup_irq() in kernel/irq/manage.c). One solution may be to stop verifying IRQF_ONESHOT in __setup_irq(), but right now the safe course of action is to revert the change. Let's revisit this in a later merge window. Reported-by: Paul Walmsley Cc: Sebastian Andrzej Siewior Requested-by: Alan Cox Acked-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2e9425889fa8..9b956fa20308 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1331,7 +1331,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; - irqflags |= IRQF_ONESHOT; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); -- cgit v1.2.3 From be27425dcc516fd08245b047ea57f83b8f6f0903 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 19 Aug 2011 16:15:10 -0700 Subject: Add a personality to report 2.6.x version numbers I ran into a couple of programs which broke with the new Linux 3.0 version. Some of those were binary only. I tried to use LD_PRELOAD to work around it, but it was quite difficult and in one case impossible because of a mix of 32bit and 64bit executables. For example, all kind of management software from HP doesnt work, unless we pretend to run a 2.6 kernel. $ uname -a Linux svivoipvnx001 3.0.0-08107-g97cd98f #1062 SMP Fri Aug 12 18:11:45 CEST 2011 i686 i686 i386 GNU/Linux $ hpacucli ctrl all show Error: No controllers detected. $ rpm -qf /usr/sbin/hpacucli hpacucli-8.75-12.0 Another notable case is that Python now reports "linux3" from sys.platform(); which in turn can break things that were checking sys.platform() == "linux2": https://bugzilla.mozilla.org/show_bug.cgi?id=664564 It seems pretty clear to me though it's a bug in the apps that are using '==' instead of .startswith(), but this allows us to unbreak broken programs. This patch adds a UNAME26 personality that makes the kernel report a 2.6.40+x version number instead. The x is the x in 3.x. I know this is somewhat ugly, but I didn't find a better workaround, and compatibility to existing programs is important. Some programs also read /proc/sys/kernel/osrelease. This can be worked around in user space with mount --bind (and a mount namespace) To use: wget ftp://ftp.kernel.org/pub/linux/kernel/people/ak/uname26/uname26.c gcc -o uname26 uname26.c ./uname26 program Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/sys.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index dd948a1fca4c..18ee1d2f6474 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -44,6 +46,8 @@ #include #include +/* Move somewhere else to avoid recompiling? */ +#include #include #include @@ -1161,6 +1165,34 @@ DECLARE_RWSEM(uts_sem); #define override_architecture(name) 0 #endif +/* + * Work around broken programs that cannot handle "Linux 3.0". + * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + */ +static int override_release(char __user *release, int len) +{ + int ret = 0; + char buf[len]; + + if (current->personality & UNAME26) { + char *rest = UTS_RELEASE; + int ndots = 0; + unsigned v; + + while (*rest) { + if (*rest == '.' && ++ndots >= 3) + break; + if (!isdigit(*rest) && *rest != '.') + break; + rest++; + } + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + snprintf(buf, len, "2.6.%u%s", v, rest); + ret = copy_to_user(release, buf, len); + } + return ret; +} + SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { int errno = 0; @@ -1170,6 +1202,8 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) errno = -EFAULT; up_read(&uts_sem); + if (!errno && override_release(name->release, sizeof(name->release))) + errno = -EFAULT; if (!errno && override_architecture(name)) errno = -EFAULT; return errno; @@ -1191,6 +1225,8 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) error = -EFAULT; up_read(&uts_sem); + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; if (!error && override_architecture(name)) error = -EFAULT; return error; @@ -1225,6 +1261,8 @@ SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) if (!error && override_architecture(name)) error = -EFAULT; + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; return error ? -EFAULT : 0; } #endif -- cgit v1.2.3 From 4c30c6f566c0989ddaee3407da44751e340a63ed Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 25 Aug 2011 15:59:11 -0700 Subject: kernel/printk: do not turn off bootconsole in printk_late_init() if keep_bootcon It seems that 7bf693951a8e ("console: allow to retain boot console via boot option keep_bootcon") doesn't always achieve what it aims, as when printk_late_init() runs it unconditionally turns off all boot consoles. With this patch, I am able to see more messages on the boot console in KVM guests than I can without, when keep_bootcon is specified. I think it is appropriate for the relevant -stable trees. However, it's more of an annoyance than a serious bug (ideally you don't need to keep the boot console around as console handover should be working -- I was encountering a situation where the console handover wasn't working and not having the boot console available meant I couldn't see why). Signed-off-by: Nishanth Aravamudan Cc: David S. Miller Cc: Alan Cox Cc: Greg KH Acked-by: Fabio M. Di Nitto Cc: [2.6.39.x, 3.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 836a2ae0ac31..28a40d8171b8 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1604,7 +1604,7 @@ static int __init printk_late_init(void) struct console *con; for_each_console(con) { - if (con->flags & CON_BOOT) { + if (!keep_bootcon && con->flags & CON_BOOT) { printk(KERN_INFO "turn off boot console %s%d\n", con->name, con->index); unregister_console(con); -- cgit v1.2.3 From f5b940997397229975ea073679b03967932a541b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 26 Aug 2011 18:03:11 -0400 Subject: All Arch: remove linkage for sys_nfsservctl system call The nfsservctl system call is now gone, so we should remove all linkage for it. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 62cbc8877fef..a9a5de07c4f1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -16,7 +16,6 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } -cond_syscall(sys_nfsservctl); cond_syscall(sys_quotactl); cond_syscall(sys32_quotactl); cond_syscall(sys_acct); -- cgit v1.2.3 From c259e01a1ec90063042f758e409cd26b2a0963c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jun 2011 19:47:00 +0200 Subject: sched: Separate the scheduler entry for preemption Block-IO and workqueues call into notifier functions from the scheduler core code with interrupts and preemption disabled. These calls should be made before entering the scheduler core. To simplify this, separate the scheduler core code into __schedule(). __schedule() is directly called from the places which set PREEMPT_ACTIVE and from schedule(). This allows us to add the work checks into schedule(), so they are only called when a task voluntary goes to sleep. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Tejun Heo Cc: Jens Axboe Cc: Linus Torvalds Cc: stable@kernel.org # 2.6.39+ Link: http://lkml.kernel.org/r/20110622174918.813258321@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..ec15e8129cf7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4279,9 +4279,9 @@ pick_next_task(struct rq *rq) } /* - * schedule() is the main scheduler function. + * __schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +static void __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -4369,6 +4369,11 @@ need_resched: if (need_resched()) goto need_resched; } + +asmlinkage void schedule(void) +{ + __schedule(); +} EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER @@ -4435,7 +4440,7 @@ asmlinkage void __sched notrace preempt_schedule(void) do { add_preempt_count_notrace(PREEMPT_ACTIVE); - schedule(); + __schedule(); sub_preempt_count_notrace(PREEMPT_ACTIVE); /* @@ -4463,7 +4468,7 @@ asmlinkage void __sched preempt_schedule_irq(void) do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); - schedule(); + __schedule(); local_irq_disable(); sub_preempt_count(PREEMPT_ACTIVE); @@ -5588,7 +5593,7 @@ static inline int should_resched(void) static void __cond_resched(void) { add_preempt_count(PREEMPT_ACTIVE); - schedule(); + __schedule(); sub_preempt_count(PREEMPT_ACTIVE); } -- cgit v1.2.3 From 9c40cef2b799f9b5e7fa5de4d2ad3a0168ba118c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jun 2011 19:47:01 +0200 Subject: sched: Move blk_schedule_flush_plug() out of __schedule() There is no real reason to run blk_schedule_flush_plug() with interrupts and preemption disabled. Move it into schedule() and call it when the task is going voluntarily to sleep. There might be false positives when the task is woken between that call and actually scheduling, but that's not really different from being woken immediately after switching away. This fixes a deadlock in the scheduler where the blk_schedule_flush_plug() callchain enables interrupts and thereby allows a wakeup to happen of the task that's going to sleep. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Tejun Heo Cc: Jens Axboe Cc: Linus Torvalds Cc: stable@kernel.org # 2.6.39+ Link: http://lkml.kernel.org/n/tip-dwfxtra7yg1b5r65m32ywtct@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ec15e8129cf7..511732c39b6e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4322,16 +4322,6 @@ need_resched: if (to_wakeup) try_to_wake_up_local(to_wakeup); } - - /* - * If we are going to sleep and we have plugged IO - * queued, make sure to submit it to avoid deadlocks. - */ - if (blk_needs_flush_plug(prev)) { - raw_spin_unlock(&rq->lock); - blk_schedule_flush_plug(prev); - raw_spin_lock(&rq->lock); - } } switch_count = &prev->nvcsw; } @@ -4370,8 +4360,23 @@ need_resched: goto need_resched; } +static inline void sched_submit_work(struct task_struct *tsk) +{ + if (!tsk->state) + return; + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(tsk)) + blk_schedule_flush_plug(tsk); +} + asmlinkage void schedule(void) { + struct task_struct *tsk = current; + + sched_submit_work(tsk); __schedule(); } EXPORT_SYMBOL(schedule); -- cgit v1.2.3 From feff8fa0075bdfd43c841e9d689ed81adda988d6 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 18 Aug 2011 20:36:57 +0800 Subject: sched: Fix a memory leak in __sdt_free() This patch fixes the following memory leak: unreferenced object 0xffff880107266800 (size 512): comm "sched-powersave", pid 3718, jiffies 4323097853 (age 27495.450s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] create_object+0x187/0x28b [] kmemleak_alloc+0x73/0x98 [] __kmalloc_node+0x104/0x159 [] kzalloc_node.clone.97+0x15/0x17 [] build_sched_domains+0xb7/0x7f3 [] partition_sched_domains+0x1db/0x24a [] do_rebuild_sched_domains+0x3b/0x47 [] rebuild_sched_domains+0x10/0x12 [] sched_power_savings_store+0x6c/0x7b [] sched_mc_power_savings_store+0x16/0x18 [] sysdev_class_store+0x20/0x22 [] sysfs_write_file+0x108/0x144 [] vfs_write+0xaf/0x102 [] sys_write+0x4d/0x74 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff Signed-off-by: WANG Cong Signed-off-by: Peter Zijlstra Cc: stable@kernel.org # 3.0 Link: http://lkml.kernel.org/r/1313671017-4112-1-git-send-email-amwang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 511732c39b6e..c79e7c63a4aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7453,6 +7453,7 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); if (sd && (sd->flags & SD_OVERLAP)) free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); kfree(*per_cpu_ptr(sdd->sg, j)); kfree(*per_cpu_ptr(sdd->sgp, j)); } -- cgit v1.2.3 From a8d757ef076f0f95f13a918808824058de25b3eb Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 25 Aug 2011 15:58:03 +0200 Subject: perf events: Fix slow and broken cgroup context switch code The current cgroup context switch code was incorrect leading to bogus counts. Furthermore, as soon as there was an active cgroup event on a CPU, the context switch cost on that CPU would increase by a significant amount as demonstrated by a simple ping/pong example: $ ./pong Both processes pinned to CPU1, running for 10s 10684.51 ctxsw/s Now start a cgroup perf stat: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 $ ./pong Both processes pinned to CPU1, running for 10s 6674.61 ctxsw/s That's a 37% penalty. Note that pong is not even in the monitored cgroup. The results shown by perf stat are bogus: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 Performance counter stats for 'sleep 100': CPU1 cycles test CPU1 16,984,189,138 cycles # 0.000 GHz The second 'cycles' event should report a count @ CPU clock (here 2.4GHz) as it is counting across all cgroups. The patch below fixes the bogus accounting and bypasses any cgroup switches in case the outgoing and incoming tasks are in the same cgroup. With this patch the same test now yields: $ ./pong Both processes pinned to CPU1, running for 10s 10775.30 ctxsw/s Start perf stat with cgroup: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Run pong outside the cgroup: $ /pong Both processes pinned to CPU1, running for 10s 10687.80 ctxsw/s The penalty is now less than 2%. And the results for perf stat are correct: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz Now perf stat reports the correct counts for for the non cgroup event. If we run pong inside the cgroup, then we also get the correct counts: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 22,297,726,205 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz 10.001457237 seconds time elapsed Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110825135803.GA4697@quad Signed-off-by: Ingo Molnar --- kernel/events/core.c | 63 +++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched.c | 2 +- 2 files changed, 54 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b8785e26ee1c..45847fbb599a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -399,14 +399,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_restore(flags); } -static inline void perf_cgroup_sched_out(struct task_struct *task) +static inline void perf_cgroup_sched_out(struct task_struct *task, + struct task_struct *next) { - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + struct perf_cgroup *cgrp1; + struct perf_cgroup *cgrp2 = NULL; + + /* + * we come here when we know perf_cgroup_events > 0 + */ + cgrp1 = perf_cgroup_from_task(task); + + /* + * next is NULL when called from perf_event_enable_on_exec() + * that will systematically cause a cgroup_switch() + */ + if (next) + cgrp2 = perf_cgroup_from_task(next); + + /* + * only schedule out current cgroup events if we know + * that we are switching to a different cgroup. Otherwise, + * do no touch the cgroup events. + */ + if (cgrp1 != cgrp2) + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); } -static inline void perf_cgroup_sched_in(struct task_struct *task) +static inline void perf_cgroup_sched_in(struct task_struct *prev, + struct task_struct *task) { - perf_cgroup_switch(task, PERF_CGROUP_SWIN); + struct perf_cgroup *cgrp1; + struct perf_cgroup *cgrp2 = NULL; + + /* + * we come here when we know perf_cgroup_events > 0 + */ + cgrp1 = perf_cgroup_from_task(task); + + /* prev can never be NULL */ + cgrp2 = perf_cgroup_from_task(prev); + + /* + * only need to schedule in cgroup events if we are changing + * cgroup during ctxsw. Cgroup events were not scheduled + * out of ctxsw out if that was not the case. + */ + if (cgrp1 != cgrp2) + perf_cgroup_switch(task, PERF_CGROUP_SWIN); } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -518,11 +558,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { } -static inline void perf_cgroup_sched_out(struct task_struct *task) +static inline void perf_cgroup_sched_out(struct task_struct *task, + struct task_struct *next) { } -static inline void perf_cgroup_sched_in(struct task_struct *task) +static inline void perf_cgroup_sched_in(struct task_struct *prev, + struct task_struct *task) { } @@ -1988,7 +2030,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * cgroup event are system-wide mode only */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_out(task); + perf_cgroup_sched_out(task, next); } static void task_ctx_sched_out(struct perf_event_context *ctx) @@ -2153,7 +2195,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void __perf_event_task_sched_in(struct task_struct *task) +void __perf_event_task_sched_in(struct task_struct *prev, + struct task_struct *task) { struct perf_event_context *ctx; int ctxn; @@ -2171,7 +2214,7 @@ void __perf_event_task_sched_in(struct task_struct *task) * cgroup event are system-wide mode only */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_in(task); + perf_cgroup_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2427,7 +2470,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) * ctxswin cgroup events which are already scheduled * in. */ - perf_cgroup_sched_out(current); + perf_cgroup_sched_out(current, NULL); raw_spin_lock(&ctx->lock); task_ctx_sched_out(ctx); diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..0408cdc6d572 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3065,7 +3065,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_disable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - perf_event_task_sched_in(current); + perf_event_task_sched_in(prev, current); #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -- cgit v1.2.3 From 7f310a5d4e8525ac0cc2f58c973d2100ce034410 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 23 Jun 2011 16:34:38 -0400 Subject: perf_event: Fix broken calc_timer_values() We detected a serious issue with PERF_SAMPLE_READ and timing information when events were being multiplexing. Samples would have time_running > time_enabled. That was easy to reproduce with a libpfm4 example (ran 3 times to cause multiplexing on Core 2): $ syst_smpl -e uops_retired:freq=1 & $ syst_smpl -e uops_retired:freq=1 & $ syst_smpl -e uops_retired:freq=1 & IIP:0x0000000040062d ... PERIOD:2355332948 ENA=40144625315 RUN=60014875184 syst_smpl: WARNING: time_running > time_enabled 63277537998 uops_retired:freq=1 , scaled The bug was not present in kernel up to (and including) 3.0. It turns out the bug was introduced by the following commit: commit c4794295917ebeda8013b6cb9c8d71ab4f74a1fa events: Move lockless timer calculation into helper function The parameters of the function got reversed yet the call sites were not updated to reflect the change. That lead to time_running and time_enabled being swapped. That had no effect when there was no multiplexing because in that case time_running = time_enabled but it would show up in any other scenario. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110829124112.GA4828@quad Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 45847fbb599a..0f857782d06f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3396,8 +3396,8 @@ static int perf_event_index(struct perf_event *event) } static void calc_timer_values(struct perf_event *event, - u64 *running, - u64 *enabled) + u64 *enabled, + u64 *running) { u64 now, ctx_time; -- cgit v1.2.3 From fa2563e41c3d6d6e8af437643981ed28ae0cb56d Mon Sep 17 00:00:00 2001 From: Thomas Tuttle Date: Wed, 14 Sep 2011 16:22:28 -0700 Subject: workqueue: lock cwq access in drain_workqueue Take cwq->gcwq->lock to avoid racing between drain_workqueue checking to make sure the workqueues are empty and cwq_dec_nr_in_flight decrementing and then incrementing nr_active when it activates a delayed work. We discovered this when a corner case in one of our drivers resulted in us trying to destroy a workqueue in which the remaining work would always requeue itself again in the same workqueue. We would hit this race condition and trip the BUG_ON on workqueue.c:3080. Signed-off-by: Thomas Tuttle Acked-by: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 25fb1b0e53fa..1783aabc6128 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2412,8 +2412,13 @@ reflush: for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + bool drained; - if (!cwq->nr_active && list_empty(&cwq->delayed_works)) + spin_lock_irq(&cwq->gcwq->lock); + drained = !cwq->nr_active && list_empty(&cwq->delayed_works); + spin_unlock_irq(&cwq->gcwq->lock); + + if (drained) continue; if (++flush_cnt == 10 || -- cgit v1.2.3