diff options
Diffstat (limited to 'kernel')
38 files changed, 1780 insertions, 1365 deletions
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 67b847dfa2bb..1f91413edb87 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -14,6 +14,7 @@ #include <linux/ctype.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/kmsg_dump.h> #include <linux/reboot.h> #include <linux/sched.h> #include <linux/sysrq.h> @@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv) */ static int kdb_dmesg(int argc, const char **argv) { - char *syslog_data[4], *start, *end, c = '\0', *p; - int diag, logging, logsize, lines = 0, adjust = 0, n; + int diag; + int logging; + int lines = 0; + int adjust = 0; + int n = 0; + int skip = 0; + struct kmsg_dumper dumper = { .active = 1 }; + size_t len; + char buf[201]; if (argc > 2) return KDB_ARGCOUNT; @@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv) kdb_set(2, setargs); } - /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] - * logical start, end+1. 
*/ - kdb_syslog_data(syslog_data); - if (syslog_data[2] == syslog_data[3]) - return 0; - logsize = syslog_data[1] - syslog_data[0]; - start = syslog_data[2]; - end = syslog_data[3]; -#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) - for (n = 0, p = start; p < end; ++p) { - c = *KDB_WRAP(p); - if (c == '\n') - ++n; - } - if (c != '\n') - ++n; + kmsg_dump_rewind_nolock(&dumper); + while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) + n++; + if (lines < 0) { if (adjust >= n) kdb_printf("buffer only contains %d lines, nothing " @@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv) else if (adjust - lines >= n) kdb_printf("buffer only contains %d lines, last %d " "lines printed\n", n, n - adjust); - if (adjust) { - for (; start < end && adjust; ++start) { - if (*KDB_WRAP(start) == '\n') - --adjust; - } - if (start < end) - ++start; - } - for (p = start; p < end && lines; ++p) { - if (*KDB_WRAP(p) == '\n') - ++lines; - } - end = p; + skip = adjust; + lines = abs(lines); } else if (lines > 0) { - int skip = n - (adjust + lines); + skip = n - lines - adjust; + lines = abs(lines); if (adjust >= n) { kdb_printf("buffer only contains %d lines, " "nothing printed\n", n); @@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv) kdb_printf("buffer only contains %d lines, first " "%d lines printed\n", n, lines); } - for (; start < end && skip; ++start) { - if (*KDB_WRAP(start) == '\n') - --skip; - } - for (p = start; p < end && lines; ++p) { - if (*KDB_WRAP(p) == '\n') - --lines; - } - end = p; + } else { + lines = n; } - /* Do a line at a time (max 200 chars) to reduce protocol overhead */ - c = '\n'; - while (start != end) { - char buf[201]; - p = buf; - if (KDB_FLAG(CMD_INTERRUPT)) - return 0; - while (start < end && (c = *KDB_WRAP(start)) && - (p - buf) < sizeof(buf)-1) { - ++start; - *p++ = c; - if (c == '\n') - break; + + if (skip >= n || skip < 0) + return 0; + + kmsg_dump_rewind_nolock(&dumper); + while 
(kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { + if (skip) { + skip--; + continue; } - *p = '\0'; - kdb_printf("%s", buf); + if (!lines--) + break; + + kdb_printf("%.*s\n", (int)len - 1, buf); } - if (c != '\n') - kdb_printf("\n"); return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 47c4e56e513b..392ec6a25844 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -205,7 +205,6 @@ extern char kdb_grep_string[]; extern int kdb_grep_leading; extern int kdb_grep_trailing; extern char *kdb_cmds[]; -extern void kdb_syslog_data(char *syslog_data[]); extern unsigned long kdb_task_state_string(const char *); extern char kdb_task_state_char (const struct task_struct *); extern unsigned long kdb_task_state(const struct task_struct *p, diff --git a/kernel/events/core.c b/kernel/events/core.c index d7d71d6ec972..f1cf0edeb39a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); event->ctx = ctx; + if (event->cpu != -1) + event->cpu = cpu; if (!task) { /* @@ -6252,6 +6254,8 @@ SYSCALL_DEFINE5(perf_event_open, } } + get_online_cpus(); + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { @@ -6304,7 +6308,7 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, cpu); + ctx = find_get_context(pmu, task, event->cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; @@ -6377,20 +6381,23 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); if (move_group) { - perf_install_in_context(ctx, group_leader, cpu); + synchronize_rcu(); + perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, cpu); + perf_install_in_context(ctx, sibling, 
event->cpu); get_ctx(ctx); } } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); + put_online_cpus(); + event->owner = current; mutex_lock(&current->perf_event_mutex); @@ -6419,6 +6426,7 @@ err_context: err_alloc: free_event(event); err_task: + put_online_cpus(); if (task) put_task_struct(task); err_group_fd: @@ -6479,6 +6487,39 @@ err: } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx; + struct perf_event_context *dst_ctx; + struct perf_event *event, *tmp; + LIST_HEAD(events); + + src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + + mutex_lock(&src_ctx->mutex); + list_for_each_entry_safe(event, tmp, &src_ctx->event_list, + event_entry) { + perf_remove_from_context(event); + put_ctx(src_ctx); + list_add(&event->event_entry, &events); + } + mutex_unlock(&src_ctx->mutex); + + synchronize_rcu(); + + mutex_lock(&dst_ctx->mutex); + list_for_each_entry_safe(event, tmp, &events, event_entry) { + list_del(&event->event_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + mutex_unlock(&dst_ctx->mutex); +} +EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); + static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 985be4d80fe8..f93532748bca 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -38,13 +38,29 @@ #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE -static struct srcu_struct uprobes_srcu; static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_SPINLOCK(uprobes_treelock); /* 
serialize rbtree access */ #define UPROBES_HASH_SZ 13 +/* + * We need separate register/unregister and mmap/munmap lock hashes because + * of mmap_sem nesting. + * + * uprobe_register() needs to install probes on (potentially) all processes + * and thus needs to acquire multiple mmap_sems (consecutively, not + * concurrently), whereas uprobe_mmap() is called while holding mmap_sem + * for the particular process doing the mmap. + * + * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem + * because of lock order against i_mmap_mutex. This means there's a hole in + * the register vma iteration where a mmap() can happen. + * + * Thus uprobe_register() can race with uprobe_mmap() and we can try and + * install a probe where one is already installed. + */ + /* serialize (un)register */ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; @@ -61,17 +77,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; */ static atomic_t uprobe_events = ATOMIC_INIT(0); -/* - * Maintain a temporary per vma info that can be used to search if a vma - * has already been handled. This structure is introduced since extending - * vm_area_struct wasnt recommended. 
- */ -struct vma_info { - struct list_head probe_list; - struct mm_struct *mm; - loff_t vaddr; -}; - struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; @@ -100,7 +105,8 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) if (!is_register) return true; - if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) + if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) + == (VM_READ|VM_EXEC)) return true; return false; @@ -129,33 +135,17 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; unsigned long addr; - int err = -EFAULT; + spinlock_t *ptl; + pte_t *ptep; addr = page_address_in_vma(page, vma); if (addr == -EFAULT) - goto out; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; + return -EFAULT; - ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + return -EAGAIN; get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); @@ -174,10 +164,8 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); - err = 0; -out: - return err; + return 0; } /** @@ -222,9 +210,8 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; struct uprobe *uprobe; - loff_t addr; int ret; - +retry: /* Read the page with vaddr into memory */ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); if (ret <= 0) @@ -246,10 +233,6 @@ static int write_opcode(struct arch_uprobe 
*auprobe, struct mm_struct *mm, if (mapping != vma->vm_file->f_mapping) goto put_out; - addr = vma_address(vma, uprobe->offset); - if (vaddr != (unsigned long)addr) - goto put_out; - ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) @@ -267,11 +250,7 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, vaddr_new = kmap_atomic(new_page); memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - - /* poke the new insn in, ASSUMES we don't cross page boundary */ - vaddr &= ~PAGE_MASK; - BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -291,6 +270,8 @@ unlock_out: put_out: put_page(old_page); + if (unlikely(ret == -EAGAIN)) + goto retry; return ret; } @@ -312,7 +293,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ void *vaddr_new; int ret; - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (ret <= 0) return ret; @@ -333,10 +314,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) uprobe_opcode_t opcode; int result; + if (current->mm == mm) { + pagefault_disable(); + result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, + sizeof(opcode)); + pagefault_enable(); + + if (likely(result == 0)) + goto out; + } + result = read_opcode(mm, vaddr, &opcode); if (result) return result; - +out: if (is_swbp_insn(&opcode)) return 1; @@ -355,7 +346,9 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { int result; - + /* + * See the comment near uprobes_hash(). 
+ */ result = is_swbp_at_addr(mm, vaddr); if (result == 1) return -EEXIST; @@ -520,7 +513,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); - INIT_LIST_HEAD(&uprobe->pending_list); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -588,20 +580,22 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) } static int -__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, - unsigned long nbytes, unsigned long offset) +__copy_insn(struct address_space *mapping, struct file *filp, char *insn, + unsigned long nbytes, loff_t offset) { - struct file *filp = vma->vm_file; struct page *page; void *vaddr; - unsigned long off1; - unsigned long idx; + unsigned long off; + pgoff_t idx; if (!filp) return -EINVAL; - idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); - off1 = offset &= ~PAGE_MASK; + if (!mapping->a_ops->readpage) + return -EIO; + + idx = offset >> PAGE_CACHE_SHIFT; + off = offset & ~PAGE_MASK; /* * Ensure that the page that has the original instruction is @@ -612,22 +606,20 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins return PTR_ERR(page); vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off1, nbytes); + memcpy(insn, vaddr + off, nbytes); kunmap_atomic(vaddr); page_cache_release(page); return 0; } -static int -copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) +static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping; unsigned long nbytes; int bytes; - addr &= ~PAGE_MASK; - nbytes = PAGE_SIZE - addr; + nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); mapping = uprobe->inode->i_mapping; /* Instruction at end of binary; copy only available bytes */ @@ -638,13 +630,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) /* 
Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, - bytes - nbytes, uprobe->offset + nbytes)) - return -ENOMEM; - + int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, + bytes - nbytes, uprobe->offset + nbytes); + if (err) + return err; bytes = nbytes; } - return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); + return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } /* @@ -672,9 +664,8 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) */ static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, loff_t vaddr) + struct vm_area_struct *vma, unsigned long vaddr) { - unsigned long addr; int ret; /* @@ -687,20 +678,22 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (!uprobe->consumers) return -EEXIST; - addr = (unsigned long)vaddr; - if (!(uprobe->flags & UPROBE_COPY_INSN)) { - ret = copy_insn(uprobe, vma, addr); + ret = copy_insn(uprobe, vma->vm_file); if (ret) return ret; if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) - return -EEXIST; + return -ENOTSUPP; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) return ret; + /* write_opcode() assumes we don't cross page boundary */ + BUG_ON((uprobe->offset & ~PAGE_MASK) + + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + uprobe->flags |= UPROBE_COPY_INSN; } @@ -713,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * Hence increment before and decrement on failure. 
*/ atomic_inc(&mm->uprobes_state.count); - ret = set_swbp(&uprobe->arch, mm, addr); + ret = set_swbp(&uprobe->arch, mm, vaddr); if (ret) atomic_dec(&mm->uprobes_state.count); @@ -721,27 +714,21 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, } static void -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) +remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) + if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) atomic_dec(&mm->uprobes_state.count); } /* - * There could be threads that have hit the breakpoint and are entering the - * notifier code and trying to acquire the uprobes_treelock. The thread - * calling delete_uprobe() that is removing the uprobe from the rb_tree can - * race with these threads and might acquire the uprobes_treelock compared - * to some of the breakpoint hit threads. In such a case, the breakpoint - * hit threads will not find the uprobe. The current unregistering thread - * waits till all other threads have hit a breakpoint, to acquire the - * uprobes_treelock before the uprobe is removed from the rbtree. + * There could be threads that have already hit the breakpoint. They + * will recheck the current insn and restart if find_uprobe() fails. + * See find_active_uprobe(). 
*/ static void delete_uprobe(struct uprobe *uprobe) { unsigned long flags; - synchronize_srcu(&uprobes_srcu); spin_lock_irqsave(&uprobes_treelock, flags); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock_irqrestore(&uprobes_treelock, flags); @@ -750,139 +737,135 @@ static void delete_uprobe(struct uprobe *uprobe) atomic_dec(&uprobe_events); } -static struct vma_info * -__find_next_vma_info(struct address_space *mapping, struct list_head *head, - struct vma_info *vi, loff_t offset, bool is_register) +struct map_info { + struct map_info *next; + struct mm_struct *mm; + unsigned long vaddr; +}; + +static inline struct map_info *free_map_info(struct map_info *info) +{ + struct map_info *next = info->next; + kfree(info); + return next; +} + +static struct map_info * +build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { + unsigned long pgoff = offset >> PAGE_SHIFT; struct prio_tree_iter iter; struct vm_area_struct *vma; - struct vma_info *tmpvi; - unsigned long pgoff; - int existing_vma; - loff_t vaddr; - - pgoff = offset >> PAGE_SHIFT; + struct map_info *curr = NULL; + struct map_info *prev = NULL; + struct map_info *info; + int more = 0; + again: + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; - existing_vma = 0; - vaddr = vma_address(vma, offset); - - list_for_each_entry(tmpvi, head, probe_list) { - if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { - existing_vma = 1; - break; - } + if (!prev && !more) { + /* + * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * reclaim. This is optimistic, no harm done if it fails. + */ + prev = kmalloc(sizeof(struct map_info), + GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (prev) + prev->next = NULL; } - - /* - * Another vma needs a probe to be installed. However skip - * installing the probe if the vma is about to be unlinked. 
- */ - if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { - vi->mm = vma->vm_mm; - vi->vaddr = vaddr; - list_add(&vi->probe_list, head); - - return vi; + if (!prev) { + more++; + continue; } - } - return NULL; -} - -/* - * Iterate in the rmap prio tree and find a vma where a probe has not - * yet been inserted. - */ -static struct vma_info * -find_next_vma_info(struct address_space *mapping, struct list_head *head, - loff_t offset, bool is_register) -{ - struct vma_info *vi, *retvi; + if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) + continue; - vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); - if (!vi) - return ERR_PTR(-ENOMEM); + info = prev; + prev = prev->next; + info->next = curr; + curr = info; - mutex_lock(&mapping->i_mmap_mutex); - retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); + info->mm = vma->vm_mm; + info->vaddr = vma_address(vma, offset); + } mutex_unlock(&mapping->i_mmap_mutex); - if (!retvi) - kfree(vi); + if (!more) + goto out; + + prev = curr; + while (curr) { + mmput(curr->mm); + curr = curr->next; + } - return retvi; + do { + info = kmalloc(sizeof(struct map_info), GFP_KERNEL); + if (!info) { + curr = ERR_PTR(-ENOMEM); + goto out; + } + info->next = prev; + prev = info; + } while (--more); + + goto again; + out: + while (prev) + prev = free_map_info(prev); + return curr; } static int register_for_each_vma(struct uprobe *uprobe, bool is_register) { - struct list_head try_list; - struct vm_area_struct *vma; - struct address_space *mapping; - struct vma_info *vi, *tmpvi; - struct mm_struct *mm; - loff_t vaddr; - int ret; + struct map_info *info; + int err = 0; - mapping = uprobe->inode->i_mapping; - INIT_LIST_HEAD(&try_list); + info = build_map_info(uprobe->inode->i_mapping, + uprobe->offset, is_register); + if (IS_ERR(info)) + return PTR_ERR(info); - ret = 0; + while (info) { + struct mm_struct *mm = info->mm; + struct vm_area_struct *vma; - for (;;) { - vi = find_next_vma_info(mapping, &try_list, 
uprobe->offset, is_register); - if (!vi) - break; + if (err) + goto free; - if (IS_ERR(vi)) { - ret = PTR_ERR(vi); - break; - } + down_write(&mm->mmap_sem); + vma = find_vma(mm, (unsigned long)info->vaddr); + if (!vma || !valid_vma(vma, is_register)) + goto unlock; - mm = vi->mm; - down_read(&mm->mmap_sem); - vma = find_vma(mm, (unsigned long)vi->vaddr); - if (!vma || !valid_vma(vma, is_register)) { - list_del(&vi->probe_list); - kfree(vi); - up_read(&mm->mmap_sem); - mmput(mm); - continue; - } - vaddr = vma_address(vma, uprobe->offset); if (vma->vm_file->f_mapping->host != uprobe->inode || - vaddr != vi->vaddr) { - list_del(&vi->probe_list); - kfree(vi); - up_read(&mm->mmap_sem); - mmput(mm); - continue; - } - - if (is_register) - ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); - else - remove_breakpoint(uprobe, mm, vi->vaddr); + vma_address(vma, uprobe->offset) != info->vaddr) + goto unlock; - up_read(&mm->mmap_sem); - mmput(mm); if (is_register) { - if (ret && ret == -EEXIST) - ret = 0; - if (ret) - break; + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + /* + * We can race against uprobe_mmap(), see the + * comment near uprobes_hash(). 
+ */ + if (err == -EEXIST) + err = 0; + } else { + remove_breakpoint(uprobe, mm, info->vaddr); } + unlock: + up_write(&mm->mmap_sem); + free: + mmput(mm); + info = free_map_info(info); } - list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { - list_del(&vi->probe_list); - kfree(vi); - } - - return ret; + return err; } static int __uprobe_register(struct uprobe *uprobe) @@ -1048,7 +1031,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head) int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; - struct uprobe *uprobe, *u; + struct uprobe *uprobe; struct inode *inode; int ret, count; @@ -1066,12 +1049,9 @@ int uprobe_mmap(struct vm_area_struct *vma) ret = 0; count = 0; - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - loff_t vaddr; - - list_del(&uprobe->pending_list); + list_for_each_entry(uprobe, &tmp_list, pending_list) { if (!ret) { - vaddr = vma_address(vma, uprobe->offset); + loff_t vaddr = vma_address(vma, uprobe->offset); if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { put_uprobe(uprobe); @@ -1079,8 +1059,10 @@ int uprobe_mmap(struct vm_area_struct *vma) } ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); - - /* Ignore double add: */ + /* + * We can race against uprobe_register(), see the + * comment near uprobes_hash(). 
+ */ if (ret == -EEXIST) { ret = 0; @@ -1115,7 +1097,7 @@ int uprobe_mmap(struct vm_area_struct *vma) void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct list_head tmp_list; - struct uprobe *uprobe, *u; + struct uprobe *uprobe; struct inode *inode; if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) @@ -1132,11 +1114,8 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, &tmp_list); - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - loff_t vaddr; - - list_del(&uprobe->pending_list); - vaddr = vma_address(vma, uprobe->offset); + list_for_each_entry(uprobe, &tmp_list, pending_list) { + loff_t vaddr = vma_address(vma, uprobe->offset); if (vaddr >= start && vaddr < end) { /* @@ -1378,9 +1357,6 @@ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - if (t->uprobe_srcu_id != -1) - srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); - if (!utask) return; @@ -1398,7 +1374,6 @@ void uprobe_free_utask(struct task_struct *t) void uprobe_copy_process(struct task_struct *t) { t->utask = NULL; - t->uprobe_srcu_id = -1; } /* @@ -1417,7 +1392,6 @@ static struct uprobe_task *add_utask(void) if (unlikely(!utask)) return NULL; - utask->active_uprobe = NULL; current->utask = utask; return utask; } @@ -1479,41 +1453,64 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) +{ + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, bp_vaddr); + if (vma && vma->vm_start <= bp_vaddr) { + if (valid_vma(vma, false)) { + struct inode *inode; + loff_t offset; + + inode = vma->vm_file->f_mapping->host; + offset = bp_vaddr - vma->vm_start; + offset += (vma->vm_pgoff << PAGE_SHIFT); + uprobe 
= find_uprobe(inode, offset); + } + + if (!uprobe) + *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + } else { + *is_swbp = -EFAULT; + } + up_read(&mm->mmap_sem); + + return uprobe; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. */ static void handle_swbp(struct pt_regs *regs) { - struct vm_area_struct *vma; struct uprobe_task *utask; struct uprobe *uprobe; - struct mm_struct *mm; unsigned long bp_vaddr; + int uninitialized_var(is_swbp); - uprobe = NULL; bp_vaddr = uprobe_get_swbp_addr(regs); - mm = current->mm; - down_read(&mm->mmap_sem); - vma = find_vma(mm, bp_vaddr); - - if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { - struct inode *inode; - loff_t offset; - - inode = vma->vm_file->f_mapping->host; - offset = bp_vaddr - vma->vm_start; - offset += (vma->vm_pgoff << PAGE_SHIFT); - uprobe = find_uprobe(inode, offset); - } - - srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); - current->uprobe_srcu_id = -1; - up_read(&mm->mmap_sem); + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { - /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + if (is_swbp > 0) { + /* No matching uprobe; signal SIGTRAP. */ + send_sig(SIGTRAP, current, 0); + } else { + /* + * Either we raced with uprobe_unregister() or we can't + * access this memory. The latter is only possible if + * another thread plays with our ->mm. In both cases + * we can simply restart. If this vma was unmapped we + * can pretend this insn was not executed yet and get + * the (correct) SIGSEGV after restart. 
+ */ + instruction_pointer_set(regs, bp_vaddr); + } return; } @@ -1620,7 +1617,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) utask->state = UTASK_BP_HIT; set_thread_flag(TIF_UPROBE); - current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); return 1; } @@ -1655,7 +1651,6 @@ static int __init init_uprobes(void) mutex_init(&uprobes_mutex[i]); mutex_init(&uprobes_mmap_mutex[i]); } - init_srcu_struct(&uprobes_srcu); return register_die_notifier(&uprobe_exception_nb); } diff --git a/kernel/fork.c b/kernel/fork.c index bebabad59202..ff1cad3b7bdc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } err = arch_dup_task_struct(tsk, orig); - if (err) - goto out; + /* + * We defer looking at err, because we will need this setup + * for the clean up path to work correctly. + */ tsk->stack = ti; - setup_thread_stack(tsk, orig); + + if (err) + goto out; + clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 8f9b4eb974e0..a70518c9d82f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -175,7 +175,7 @@ config PM_TEST_SUSPEND You probably want to have your system's RTC driver statically linked, ensuring that it's available when this test runs. -config CAN_PM_TRACE +config PM_SLEEP_DEBUG def_bool y depends on PM_DEBUG && PM_SLEEP @@ -196,7 +196,7 @@ config PM_TRACE config PM_TRACE_RTC bool "Suspend/resume event tracing" - depends on CAN_PM_TRACE + depends on PM_SLEEP_DEBUG depends on X86 select PM_TRACE ---help--- diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8b53db38a279..b26f5f1e773e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -5,6 +5,7 @@ * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 
+ * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> * * This file is released under the GPLv2. */ @@ -27,7 +28,6 @@ #include <linux/syscore_ops.h> #include <linux/ctype.h> #include <linux/genhd.h> -#include <scsi/scsi_scan.h> #include "power.h" @@ -46,6 +46,9 @@ enum { HIBERNATION_PLATFORM, HIBERNATION_SHUTDOWN, HIBERNATION_REBOOT, +#ifdef CONFIG_SUSPEND + HIBERNATION_SUSPEND, +#endif /* keep last */ __HIBERNATION_AFTER_LAST }; @@ -354,6 +357,7 @@ int hibernation_snapshot(int platform_mode) } suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -379,6 +383,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); + ftrace_start(); resume_console(); dpm_complete(msg); @@ -481,6 +486,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -488,6 +494,7 @@ int hibernation_restore(int platform_mode) dpm_resume_end(PMSG_RECOVER); } pm_restore_gfp_mask(); + ftrace_start(); resume_console(); pm_restore_console(); return error; @@ -514,6 +521,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + ftrace_stop(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -557,6 +565,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + ftrace_start(); resume_console(); Close: @@ -574,6 +583,10 @@ int hibernation_platform_enter(void) */ static void power_down(void) { +#ifdef CONFIG_SUSPEND + int error; +#endif + switch (hibernation_mode) { case HIBERNATION_REBOOT: kernel_restart(NULL); @@ -583,6 +596,25 @@ static void power_down(void) case HIBERNATION_SHUTDOWN: kernel_power_off(); break; +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + if (error) { + if 
(hibernation_ops) + hibernation_mode = HIBERNATION_PLATFORM; + else + hibernation_mode = HIBERNATION_SHUTDOWN; + power_down(); + } + /* + * Restore swap signature. + */ + error = swsusp_unmark(); + if (error) + printk(KERN_ERR "PM: Swap will be unusable! " + "Try swapon -a.\n"); + return; +#endif } kernel_halt(); /* @@ -748,13 +780,6 @@ static int software_resume(void) async_synchronize_full(); } - /* - * We can't depend on SCSI devices being available after loading - * one of their modules until scsi_complete_async_scans() is - * called and the resume device usually is a SCSI one. - */ - scsi_complete_async_scans(); - swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { error = -ENODEV; @@ -827,6 +852,9 @@ static const char * const hibernation_modes[] = { [HIBERNATION_PLATFORM] = "platform", [HIBERNATION_SHUTDOWN] = "shutdown", [HIBERNATION_REBOOT] = "reboot", +#ifdef CONFIG_SUSPEND + [HIBERNATION_SUSPEND] = "suspend", +#endif }; /* @@ -867,6 +895,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, switch (i) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -907,6 +938,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, switch (mode) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: +#ifdef CONFIG_SUSPEND + case HIBERNATION_SUSPEND: +#endif hibernation_mode = mode; break; case HIBERNATION_PLATFORM: diff --git a/kernel/power/main.c b/kernel/power/main.c index 428f8a034e96..f458238109cc 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -235,6 +235,47 @@ late_initcall(pm_debugfs_init); #endif /* CONFIG_PM_SLEEP */ +#ifdef CONFIG_PM_SLEEP_DEBUG +/* + * pm_print_times: print time taken by devices to suspend and resume. + * + * show() returns whether printing of suspend and resume times is enabled. + * store() accepts 0 or 1. 
0 disables printing and 1 enables it. + */ +bool pm_print_times_enabled; + +static ssize_t pm_print_times_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pm_print_times_enabled); +} + +static ssize_t pm_print_times_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_print_times_enabled = !!val; + return n; +} + +power_attr(pm_print_times); + +static inline void pm_print_times_init(void) +{ + pm_print_times_enabled = !!initcall_debug; +} +#else /* !CONFIG_PM_SLEEP_DEBUG */ +static inline void pm_print_times_init(void) {} +#endif /* CONFIG_PM_SLEEP_DEBUG */ + struct kobject *power_kobj; /** @@ -531,6 +572,9 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif +#ifdef CONFIG_PM_SLEEP_DEBUG + &pm_print_times_attr.attr, +#endif #endif NULL, }; @@ -566,6 +610,7 @@ static int __init pm_init(void) error = sysfs_create_group(power_kobj, &attr_group); if (error) return error; + pm_print_times_init(); return pm_autosleep_init(); } diff --git a/kernel/power/power.h b/kernel/power/power.h index b0bd4beaebfe..7d4b7ffb3c1d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -156,6 +156,9 @@ extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); extern void swsusp_close(fmode_t); +#ifdef CONFIG_SUSPEND +extern int swsusp_unmark(void); +#endif /* kernel/power/block_io.c */ extern struct block_device *hib_resume_bdev; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 396d262b8fd0..c8b7446b27df 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> +#include <linux/ftrace.h> #include <trace/events/power.h> #include "power.h" @@ -212,6 +213,7 
@@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_stop(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -231,6 +233,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + ftrace_start(); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11e22c068e8b..3c9d764eb0d8 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle, struct timeval start; struct timeval stop; - printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", + printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); - m = nr_to_write / 100; + m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; @@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO "PM: Image saving progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); @@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) - printk(KERN_CONT "\b\b\b\bdone\n"); - else - printk(KERN_CONT "\n"); + printk(KERN_INFO "PM: Image saving done.\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); return ret; } @@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle, printk(KERN_INFO "PM: Using %u thread(s) for compression.\n" - "PM: Compressing and saving image data (%u pages) ... 
", + "PM: Compressing and saving image data (%u pages)...\n", nr_threads, nr_to_write); - m = nr_to_write / 100; + m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; @@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle, data_of(*snapshot), PAGE_SIZE); if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", - nr_pages / m); + printk(KERN_INFO + "PM: Image saving progress: " + "%3d%%\n", + nr_pages / m * 10); nr_pages++; } if (!off) @@ -761,11 +762,8 @@ out_finish: do_gettimeofday(&stop); if (!ret) ret = err2; - if (!ret) { - printk(KERN_CONT "\b\b\b\bdone\n"); - } else { - printk(KERN_CONT "\n"); - } + if (!ret) + printk(KERN_INFO "PM: Image saving done.\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); out_clean: if (crc) { @@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle, int err2; unsigned nr_pages; - printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", + printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); - m = nr_to_read / 100; + m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; @@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO "PM: Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); @@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) { - printk("\b\b\b\bdone\n"); + printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; - } else - printk("\n"); + } swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return ret; } @@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle, printk(KERN_INFO "PM: Using %u thread(s) for decompression.\n" - "PM: Loading and decompressing image data (%u pages) ... 
", + "PM: Loading and decompressing image data (%u pages)...\n", nr_threads, nr_to_read); - m = nr_to_read / 100; + m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; @@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle, data[thr].unc + off, PAGE_SIZE); if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); + printk(KERN_INFO + "PM: Image loading progress: " + "%3d%%\n", + nr_pages / m * 10); nr_pages++; ret = snapshot_write_next(snapshot); @@ -1344,7 +1345,7 @@ out_finish: } do_gettimeofday(&stop); if (!ret) { - printk("\b\b\b\bdone\n"); + printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; @@ -1357,8 +1358,7 @@ out_finish: } } } - } else - printk("\n"); + } swsusp_show_speed(&start, &stop, nr_to_read, "Read"); out_clean: for (i = 0; i < ring_size; i++) @@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode) blkdev_put(hib_resume_bdev, mode); } +/** + * swsusp_unmark - Unmark swsusp signature in the resume device + */ + +#ifdef CONFIG_SUSPEND +int swsusp_unmark(void) +{ + int error; + + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); + error = hib_bio_write_page(swsusp_resume_block, + swsusp_header, NULL); + } else { + printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); + error = -ENODEV; + } + + /* + * We just returned from suspend, we don't need the image any more. 
+ */ + free_all_swap_pages(root_swap); + + return error; +} +#endif + static int swsusp_header_init(void) { swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); diff --git a/kernel/power/user.c b/kernel/power/user.c index 91b0fd021a95..4ed81e74f86f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -24,7 +24,6 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> -#include <scsi/scsi_scan.h> #include <asm/uaccess.h> @@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) * appear. */ wait_for_device_probe(); - scsi_complete_async_scans(); data->swap = -1; data->mode = O_WRONLY; diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index c8fba3380076..8f50de394d22 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -9,6 +9,7 @@ * manipulate wakelocks on Android. */ +#include <linux/capability.h> #include <linux/ctype.h> #include <linux/device.h> #include <linux/err.h> @@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf) size_t len; int ret = 0; + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + while (*str && !isspace(*str)) str++; @@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf) size_t len; int ret = 0; + if (!capable(CAP_BLOCK_SUSPEND)) + return -EPERM; + len = strlen(buf); if (!len) return -EINVAL; diff --git a/kernel/printk.c b/kernel/printk.c index 177fa49357a5..ac4bc9e79465 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1192,21 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } -#ifdef CONFIG_KGDB_KDB -/* kdb dmesg command needs access to the syslog buffer. do_syslog() - * uses locks so it cannot be used during debugging. Just tell kdb - * where the start and end of the physical and logical logs are. This - * is equivalent to do_syslog(3). 
- */ -void kdb_syslog_data(char *syslog_data[4]) -{ - syslog_data[0] = log_buf; - syslog_data[1] = log_buf + log_buf_len; - syslog_data[2] = log_buf + log_first_idx; - syslog_data[3] = log_buf + log_next_idx; -} -#endif /* CONFIG_KGDB_KDB */ - static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) @@ -2525,7 +2510,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) } /** - * kmsg_dump_get_line - retrieve one kmsg log line + * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) * @dumper: registered kmsg dumper * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to @@ -2540,11 +2525,12 @@ void kmsg_dump(enum kmsg_dump_reason reason) * * A return value of FALSE indicates that there are no more records to * read. + * + * The function is similar to kmsg_dump_get_line(), but grabs no locks. */ -bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) +bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) { - unsigned long flags; struct log *msg; size_t l = 0; bool ret = false; @@ -2552,7 +2538,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, if (!dumper->active) goto out; - raw_spin_lock_irqsave(&logbuf_lock, flags); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; @@ -2560,10 +2545,8 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, } /* last entry */ - if (dumper->cur_seq >= log_next_seq) { - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + if (dumper->cur_seq >= log_next_seq) goto out; - } msg = log_from_idx(dumper->cur_idx); l = msg_print_text(msg, 0, syslog, line, size); @@ -2571,12 +2554,41 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; ret = true; - 
raw_spin_unlock_irqrestore(&logbuf_lock, flags); out: if (len) *len = l; return ret; } + +/** + * kmsg_dump_get_line - retrieve one kmsg log line + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + return ret; +} EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** @@ -2679,6 +2691,24 @@ out: EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** + * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + * + * The function is similar to kmsg_dump_rewind(), but grabs no locks. 
+ */ +void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) +{ + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; +} + +/** * kmsg_dump_rewind - reset the interator * @dumper: registered kmsg dumper * @@ -2691,10 +2721,7 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) unsigned long flags; raw_spin_lock_irqsave(&logbuf_lock, flags); - dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + kmsg_dump_rewind_nolock(dumper); raw_spin_unlock_irqrestore(&logbuf_lock, flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 95cba41ce1e9..4e6a61b15e86 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -54,6 +54,50 @@ #ifdef CONFIG_PREEMPT_RCU /* + * Preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* critical section after entry code. */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting != 1) { + --t->rcu_read_lock_nesting; + } else { + barrier(); /* critical section before exit code. 
*/ + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, * as debug_check_no_locks_held() already does this if lockdep diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 37a5444204d2..547b1fe5b052 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -172,7 +172,7 @@ void rcu_irq_enter(void) local_irq_restore(flags); } -#ifdef CONFIG_PROVE_RCU +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Test whether RCU thinks that the current CPU is idle. @@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#endif /* #ifdef CONFIG_PROVE_RCU */ +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* * Test whether the current CPU was interrupted from idle. 
Nested diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index fc31a2d65100..918fd1e8509c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { RCU_TRACE(.rcb.name = "rcu_preempt") }; -static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(void); static void rcu_report_exp_done(void); @@ -351,8 +350,9 @@ static int rcu_initiate_boost(void) rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; invoke_rcu_callbacks(); - } else + } else { RCU_TRACE(rcu_initiate_boost_trace()); + } return 1; } @@ -527,23 +527,11 @@ void rcu_preempt_note_context_switch(void) } /* - * Tiny-preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static noinline void rcu_read_unlock_special(struct task_struct *t) +void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -627,38 +615,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) } /* - * Tiny-preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. 
- */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ - if (t->rcu_read_lock_nesting != 1) - --t->rcu_read_lock_nesting; - else { - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - -/* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the rcu_preempt_ctrlblk structure, which is * checked elsewhere. This is called from the scheduling-clock interrupt. @@ -823,9 +779,9 @@ void synchronize_rcu_expedited(void) rpcp->exp_tasks = NULL; /* Wait for tail of ->blkd_tasks list to drain. */ - if (!rcu_preempted_readers_exp()) + if (!rcu_preempted_readers_exp()) { local_irq_restore(flags); - else { + } else { rcu_initiate_boost(); local_irq_restore(flags); wait_event(sync_rcu_preempt_exp_wq, @@ -846,8 +802,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); */ int rcu_preempt_needs_cpu(void) { - if (!rcu_preempt_running_reader()) - rcu_preempt_cpu_qs(); return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; } diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e66b34ab7555..25b15033c61f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -49,8 +49,7 @@ #include <asm/byteorder.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " - "Josh Triplett <josh@freedesktop.org>"); +MODULE_AUTHOR("Paul E. 
McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ @@ -206,6 +205,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static bool barrier_phase; /* Test phase. */ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); @@ -407,8 +407,9 @@ rcu_torture_cb(struct rcu_head *p) if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; rcu_torture_free(rp); - } else + } else { cur_ops->deferred_free(rp); + } } static int rcu_no_completed(void) @@ -635,6 +636,17 @@ static void srcu_torture_synchronize(void) synchronize_srcu(&srcu_ctl); } +static void srcu_torture_call(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + call_srcu(&srcu_ctl, head, func); +} + +static void srcu_torture_barrier(void) +{ + srcu_barrier(&srcu_ctl); +} + static int srcu_torture_stats(char *page) { int cnt = 0; @@ -661,8 +673,8 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .name = "srcu" }; @@ -1013,7 +1025,11 @@ rcu_torture_fakewriter(void *arg) do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); - cur_ops->sync(); + if (cur_ops->cb_barrier != NULL && + rcu_random(&rand) % (nfakewriters * 8) == 0) + cur_ops->cb_barrier(); + else + cur_ops->sync(); rcu_stutter_wait("rcu_torture_fakewriter"); 
} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); @@ -1183,27 +1199,27 @@ rcu_torture_printk(char *page) } cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d rtbke: %ld rtbre: %ld " - "rtbf: %ld rtb: %ld nt: %ld " - "onoff: %ld/%ld:%ld/%ld " - "barrier: %ld/%ld:%ld", + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free), + atomic_read(&n_rcu_torture_free)); + cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror, + n_rcu_torture_boost_rterror); + cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers, + n_rcu_torture_timers); + cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", n_online_successes, n_online_attempts, n_offline_successes, - n_offline_attempts, + n_offline_attempts); + cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); @@ -1445,8 +1461,7 @@ rcu_torture_shutdown(void *arg) delta = shutdown_time - jiffies_snap; if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu " - "jiffies remaining\n", + "rcu_torture_shutdown task: %lu jiffies remaining\n", torture_type, delta); schedule_timeout_interruptible(delta); jiffies_snap = ACCESS_ONCE(jiffies); @@ -1498,8 +1513,7 @@ rcu_torture_onoff(void *arg) if (cpu_down(cpu) == 0) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "offlined %d\n", + "rcu_torture_onoff task: offlined %d\n", torture_type, cpu); n_offline_successes++; } @@ -1512,8 +1526,7 @@ rcu_torture_onoff(void *arg) if 
(cpu_up(cpu) == 0) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG - "rcu_torture_onoff task: " - "onlined %d\n", + "rcu_torture_onoff task: onlined %d\n", torture_type, cpu); n_online_successes++; } @@ -1631,6 +1644,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu) static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; + bool lastphase = 0; struct rcu_head rcu; init_rcu_head_on_stack(&rcu); @@ -1638,9 +1652,11 @@ static int rcu_torture_barrier_cbs(void *arg) set_user_nice(current, 19); do { wait_event(barrier_cbs_wq[myid], - atomic_read(&barrier_cbs_count) == n_barrier_cbs || + barrier_phase != lastphase || kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP); + lastphase = barrier_phase; + smp_mb(); /* ensure barrier_phase load before ->call(). */ if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) break; cur_ops->call(&rcu, rcu_torture_barrier_cbf); @@ -1665,7 +1681,8 @@ static int rcu_torture_barrier(void *arg) do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); - /* wake_up() path contains the required barriers. */ + smp_mb(); /* Ensure barrier_phase after prior assignments. 
*/ + barrier_phase = !barrier_phase; for (i = 0; i < n_barrier_cbs; i++) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, @@ -1684,7 +1701,7 @@ static int rcu_torture_barrier(void *arg) schedule_timeout_interruptible(HZ / 10); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + rcutorture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); return 0; @@ -1908,8 +1925,8 @@ rcu_torture_init(void) static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, - &srcu_raw_sync_ops, &srcu_expedited_ops, + &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, + &srcu_raw_ops, &srcu_raw_sync_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); @@ -1931,8 +1948,7 @@ rcu_torture_init(void) return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { - printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " - "fqs_duration, fqs disabled.\n"); + printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 38ecdda3f55f..f280e542e3e9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -60,36 +60,44 @@ /* Data structures. */ -static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; - -#define RCU_STATE_INITIALIZER(structname) { \ - .level = { &structname##_state.node[0] }, \ - .levelcnt = { \ - NUM_RCU_LVL_0, /* root of hierarchy. 
*/ \ - NUM_RCU_LVL_1, \ - NUM_RCU_LVL_2, \ - NUM_RCU_LVL_3, \ - NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ - }, \ +static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + +#define RCU_STATE_INITIALIZER(sname, cr) { \ + .level = { &sname##_state.node[0] }, \ + .call = cr, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ - .orphan_nxttail = &structname##_state.orphan_nxtlist, \ - .orphan_donetail = &structname##_state.orphan_donelist, \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ - .n_force_qs = 0, \ - .n_force_qs_ngp = 0, \ - .name = #structname, \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ + .orphan_nxttail = &sname##_state.orphan_nxtlist, \ + .orphan_donetail = &sname##_state.orphan_donelist, \ + .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ + .name = #sname, \ } -struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); +struct rcu_state rcu_sched_state = + RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +LIST_HEAD(rcu_struct_flavors); + +/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ +static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; +module_param(rcu_fanout_leaf, int, 0); +int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; +static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ + NUM_RCU_LVL_0, + NUM_RCU_LVL_1, + NUM_RCU_LVL_2, + NUM_RCU_LVL_3, + NUM_RCU_LVL_4, +}; +int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. 
*/ /* * The rcu_scheduler_active variable transitions from zero to one just @@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; -/* State information for rcu_barrier() and friends. */ - -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; - /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -201,6 +202,7 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -357,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -467,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) trace_rcu_dyntick("Error on exit: not idle task", oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ALL); + ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! 
*/ @@ -584,8 +586,6 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } -#ifdef CONFIG_PROVE_RCU - /** * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * @@ -603,7 +603,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* * Is the current CPU online? Disable preemption to avoid false positives @@ -644,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void) } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -#endif /* #ifdef CONFIG_PROVE_RCU */ +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ /** * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle @@ -732,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) int cpu; long delta; unsigned long flags; - int ndetected; + int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); /* Only let one CPU complain about others per time interval. 
*/ @@ -773,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) */ rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected = rcu_print_task_stall(rnp); + ndetected += rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); print_cpu_stall_info_end(); @@ -859,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) */ void rcu_cpu_stall_reset(void) { - rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; - rcu_preempt_stall_reset(); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } static struct notifier_block rcu_panic_block = { @@ -893,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct if (rnp->qsmask & rdp->grpmask) { rdp->qs_pending = 1; rdp->passed_quiesce = 0; - } else + } else { rdp->qs_pending = 0; + } zero_cpu_stall_ticks(rdp); } } @@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) } /* + * Initialize the specified rcu_data structure's callback list to empty. + */ +static void init_callback_list(struct rcu_data *rdp) +{ + int i; + + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; +} + +/* * Advance this CPU's callbacks, but only if the current grace period * has ended. This may be called only from the CPU to whom the rdp * belongs. In addition, the corresponding leaf rcu_node structure's @@ -1327,8 +1339,6 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i; - /* * Orphan the callbacks. First adjust the counts. 
This is safe * because ->onofflock excludes _rcu_barrier()'s adoption of @@ -1339,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->qlen += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen_lazy = 0; - rdp->qlen = 0; + ACCESS_ONCE(rdp->qlen) = 0; } /* @@ -1368,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, } /* Finally, initialize the rcu_data structure's list to empty. */ - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + init_callback_list(rdp); } /* @@ -1504,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp, true); + WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", + cpu, rdp->qlen, rdp->nxtlist); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -1591,7 +1602,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } smp_mb(); /* List handling before counting for rcu_barrier(). */ rdp->qlen_lazy -= count_lazy; - rdp->qlen -= count; + ACCESS_ONCE(rdp->qlen) -= count; rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ @@ -1604,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->n_force_qs_snap = rsp->n_force_qs; } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = rdp->qlen; + WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); local_irq_restore(flags); @@ -1744,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: - if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) - break; /* So gcc recognizes the dead code. 
*/ raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ @@ -1787,9 +1797,10 @@ unlock_fqs_ret: * whom the rdp belongs. */ static void -__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +__rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -1825,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_process_callbacks(struct softirq_action *unused) { + struct rcu_state *rsp; + trace_rcu_utilization("Start RCU core"); - __rcu_process_callbacks(&rcu_sched_state, - &__get_cpu_var(rcu_sched_data)); - __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); - rcu_preempt_process_callbacks(); + for_each_rcu_flavor(rsp) + __rcu_process_callbacks(rsp); trace_rcu_utilization("End RCU core"); } @@ -1856,6 +1867,56 @@ static void invoke_rcu_core(void) raise_softirq(RCU_SOFTIRQ); } +/* + * Handle any core-RCU processing required by a call_rcu() invocation. + */ +static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, + struct rcu_head *head, unsigned long flags) +{ + /* + * If called from an extended quiescent state, invoke the RCU + * core in order to force a re-evaluation of RCU's idleness. + */ + if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) + invoke_rcu_core(); + + /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ + if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) + return; + + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + + /* Are we ignoring a completed grace period? 
*/ + rcu_process_gp_end(rsp, rdp); + check_for_new_grace_period(rsp, rdp); + + /* Start a new grace period if one not already started. */ + if (!rcu_gp_in_progress(rsp)) { + unsigned long nestflag; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); + rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + } else { + /* Give the grace period a kick. */ + rdp->blimit = LONG_MAX; + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; + } + } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) + force_quiescent_state(rsp, 1); +} + static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_state *rsp, bool lazy) @@ -1880,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - rdp->qlen++; + ACCESS_ONCE(rdp->qlen)++; if (lazy) rdp->qlen_lazy++; else @@ -1895,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), else trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); - /* If interrupts were disabled, don't dive into RCU core. */ - if (irqs_disabled_flags(flags)) { - local_irq_restore(flags); - return; - } - - /* - * Force the grace period if too many callbacks or too long waiting. - * Enforce hysteresis, and don't invoke force_quiescent_state() - * if some other CPU has recently done so. Also, don't bother - * invoking force_quiescent_state() if the newly enqueued callback - * is the only one waiting for a grace period to complete. - */ - if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { - - /* Are we ignoring a completed grace period? */ - rcu_process_gp_end(rsp, rdp); - check_for_new_grace_period(rsp, rdp); - - /* Start a new grace period if one not already started. 
*/ - if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ - } else { - /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; - } - } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); + /* Go handle any RCU core processing required. */ + __call_rcu_core(rsp, rdp, head, flags); local_irq_restore(flags); } @@ -1961,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * occasionally incorrectly indicate that there are multiple CPUs online * when there was in fact only one the whole time, as this just adds * some overhead: RCU still operates correctly. - * - * Of course, sampling num_online_cpus() with preemption enabled can - * give erroneous results if there are concurrent CPU-hotplug operations. - * For example, given a demonic sequence of preemptions in num_online_cpus() - * and CPU-hotplug operations, there could be two or more CPUs online at - * all times, but num_online_cpus() might well return one (or even zero). - * - * However, all such demonic sequences require at least one CPU-offline - * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer - * is only a problem if there is an RCU read-side critical section executing - * throughout. But RCU-sched and RCU-bh read-side critical sections - * disable either preemption or bh, which prevents a CPU from going offline. - * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return - * that there is only one CPU when in fact there was more than one throughout - * is when there were no RCU readers in the system. 
If there are no - * RCU readers, the grace period by definition can be of zero length, - * regardless of the number of online CPUs. */ static inline int rcu_blocking_is_gp(void) { + int ret; + might_sleep(); /* Check for RCU read-side critical section. */ - return num_online_cpus() <= 1; + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; } /** @@ -2117,9 +2131,9 @@ void synchronize_sched_expedited(void) put_online_cpus(); /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) + if (trycount++ < 10) { udelay(trycount * num_online_cpus()); - else { + } else { synchronize_sched(); return; } @@ -2240,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) */ static int rcu_pending(int cpu) { - return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || - __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_pending(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + return 1; + return 0; } /* @@ -2252,20 +2269,41 @@ static int rcu_pending(int cpu) */ static int rcu_cpu_has_callbacks(int cpu) { + struct rcu_state *rsp; + /* RCU callbacks either ready or pending? */ - return per_cpu(rcu_sched_data, cpu).nxtlist || - per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_cpu_has_callbacks(cpu); + for_each_rcu_flavor(rsp) + if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) + return 1; + return 0; +} + +/* + * Helper function for _rcu_barrier() tracing. If tracing is disabled, + * the compiler is expected to optimize this away. + */ +static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, + int cpu, unsigned long done) +{ + trace_rcu_barrier(rsp->name, s, cpu, + atomic_read(&rsp->barrier_cpu_count), done); } /* * RCU callback function for _rcu_barrier(). If we are last, wake * up the task executing _rcu_barrier(). 
*/ -static void rcu_barrier_callback(struct rcu_head *notused) +static void rcu_barrier_callback(struct rcu_head *rhp) { - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); + struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); + struct rcu_state *rsp = rdp->rsp; + + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); + complete(&rsp->barrier_completion); + } else { + _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + } } /* @@ -2273,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused) */ static void rcu_barrier_func(void *type) { - int cpu = smp_processor_id(); - struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); + struct rcu_state *rsp = type; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - atomic_inc(&rcu_barrier_cpu_count); - call_rcu_func = type; - call_rcu_func(head, rcu_barrier_callback); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + rsp->call(&rdp->barrier_head, rcu_barrier_callback); } /* * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. */ -static void _rcu_barrier(struct rcu_state *rsp, - void (*call_rcu_func)(struct rcu_head *head, - void (*func)(struct rcu_head *head))) +static void _rcu_barrier(struct rcu_state *rsp) { int cpu; unsigned long flags; struct rcu_data *rdp; - struct rcu_head rh; + struct rcu_data rd; + unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); + unsigned long snap_done; - init_rcu_head_on_stack(&rh); + init_rcu_head_on_stack(&rd.barrier_head); + _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. 
*/ - mutex_lock(&rcu_barrier_mutex); + mutex_lock(&rsp->barrier_mutex); + + /* + * Ensure that all prior references, including to ->n_barrier_done, + * are ordered before the _rcu_barrier() machinery. + */ + smp_mb(); /* See above block comment. */ + + /* + * Recheck ->n_barrier_done to see if others did our work for us. + * This means checking ->n_barrier_done for an even-to-odd-to-even + * transition. The "if" expression below therefore rounds the old + * value up to the next even number and adds two before comparing. + */ + snap_done = ACCESS_ONCE(rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "Check", -1, snap_done); + if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); + smp_mb(); /* caller's subsequent code after above check. */ + mutex_unlock(&rsp->barrier_mutex); + return; + } - smp_mb(); /* Prevent any prior operations from leaking in. */ + /* + * Increment ->n_barrier_done to avoid duplicate work. Use + * ACCESS_ONCE() to prevent the compiler from speculating + * the increment to precede the early-exit check. + */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); + smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ /* * Initialize the count to one rather than to zero in order to @@ -2320,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp, * 6. Both rcu_barrier_callback() callbacks are invoked, awakening * us -- but before CPU 1's orphaned callbacks are invoked!!! 
*/ - init_completion(&rcu_barrier_completion); - atomic_set(&rcu_barrier_cpu_count, 1); + init_completion(&rsp->barrier_completion); + atomic_set(&rsp->barrier_cpu_count, 1); raw_spin_lock_irqsave(&rsp->onofflock, flags); rsp->rcu_barrier_in_progress = current; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); @@ -2337,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp, preempt_disable(); rdp = per_cpu_ptr(rsp->rda, cpu); if (cpu_is_offline(cpu)) { + _rcu_barrier_trace(rsp, "Offline", cpu, + rsp->n_barrier_done); preempt_enable(); while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) schedule_timeout_interruptible(1); } else if (ACCESS_ONCE(rdp->qlen)) { - smp_call_function_single(cpu, rcu_barrier_func, - (void *)call_rcu_func, 1); + _rcu_barrier_trace(rsp, "OnlineQ", cpu, + rsp->n_barrier_done); + smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); preempt_enable(); } else { + _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + rsp->n_barrier_done); preempt_enable(); } } @@ -2361,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp, rcu_adopt_orphan_cbs(rsp); rsp->rcu_barrier_in_progress = NULL; raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rcu_barrier_cpu_count); + atomic_inc(&rsp->barrier_cpu_count); smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - call_rcu_func(&rh, rcu_barrier_callback); + rd.rsp = rsp; + rsp->call(&rd.barrier_head, rcu_barrier_callback); /* * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - complete(&rcu_barrier_completion); + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) + complete(&rsp->barrier_completion); + + /* Increment ->n_barrier_done to prevent duplicate work. */ + smp_mb(); /* Keep increment after above mechanism. 
*/ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); + smp_mb(); /* Keep increment before caller's subsequent code. */ /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ - wait_for_completion(&rcu_barrier_completion); + wait_for_completion(&rsp->barrier_completion); /* Other rcu_barrier() invocations can now safely proceed. */ - mutex_unlock(&rcu_barrier_mutex); + mutex_unlock(&rsp->barrier_mutex); - destroy_rcu_head_on_stack(&rh); + destroy_rcu_head_on_stack(&rd.barrier_head); } /** @@ -2386,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp, */ void rcu_barrier_bh(void) { - _rcu_barrier(&rcu_bh_state, call_rcu_bh); + _rcu_barrier(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -2395,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh); */ void rcu_barrier_sched(void) { - _rcu_barrier(&rcu_sched_state, call_rcu_sched); + _rcu_barrier(&rcu_sched_state); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); @@ -2406,18 +2485,15 @@ static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - int i; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. 
*/ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + init_callback_list(rdp); rdp->qlen_lazy = 0; - rdp->qlen = 0; + ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); @@ -2491,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) static void __cpuinit rcu_prepare_cpu(int cpu) { - rcu_init_percpu_data(cpu, &rcu_sched_state, 0); - rcu_init_percpu_data(cpu, &rcu_bh_state, 0); - rcu_preempt_init_percpu_data(cpu); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_init_percpu_data(cpu, rsp, + strcmp(rsp->name, "rcu_preempt") == 0); } /* @@ -2505,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; + struct rcu_state *rsp; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2529,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, * touch any data without introducing corruption. We send the * dying CPU's callbacks to an arbitrarily chosen online CPU. 
*/ - rcu_cleanup_dying_cpu(&rcu_bh_state); - rcu_cleanup_dying_cpu(&rcu_sched_state); - rcu_preempt_cleanup_dying_cpu(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_cpu(rsp); rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); - rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); - rcu_preempt_cleanup_dead_cpu(cpu); + for_each_rcu_flavor(rsp) + rcu_cleanup_dead_cpu(cpu, rsp); break; default: break; @@ -2573,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = NUM_RCU_LVLS - 1; i > 0; i--) + for (i = rcu_num_lvls - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; + rsp->levelspread[0] = rcu_fanout_leaf; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) @@ -2585,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int i; cprv = NR_CPUS; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; @@ -2612,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* Initialize the level-tracking arrays. */ - for (i = 1; i < NUM_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) + rsp->levelcnt[i] = num_rcu_lvl[i]; + for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; rcu_init_levelspread(rsp); /* Initialize the elements themselves, starting from the leaves. 
*/ - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { @@ -2648,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; - rnp = rsp->level[NUM_RCU_LVLS - 1]; + rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } + list_add(&rsp->flavors, &rcu_struct_flavors); +} + +/* + * Compute the rcu_node tree geometry from kernel parameters. This cannot + * replace the definitions in rcutree.h because those are needed to size + * the ->node array in the rcu_state structure. + */ +static void __init rcu_init_geometry(void) +{ + int i; + int j; + int n = nr_cpu_ids; + int rcu_capacity[MAX_RCU_LVLS + 1]; + + /* If the compile-time values are accurate, just leave. */ + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) + return; + + /* + * Compute number of nodes that can be handled by an rcu_node tree + * with the given number of levels. Setting rcu_capacity[0] makes + * some of the arithmetic easier. + */ + rcu_capacity[0] = 1; + rcu_capacity[1] = rcu_fanout_leaf; + for (i = 2; i <= MAX_RCU_LVLS; i++) + rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; + + /* + * The boot-time rcu_fanout_leaf parameter is only permitted + * to increase the leaf-level fanout, not decrease it. Of course, + * the leaf-level fanout cannot exceed the number of bits in + * the rcu_node masks. Finally, the tree must be able to accommodate + * the configured number of CPUs. Complain and fall back to the + * compile-time values if these limits are exceeded. + */ + if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || + rcu_fanout_leaf > sizeof(unsigned long) * 8 || + n > rcu_capacity[MAX_RCU_LVLS]) { + WARN_ON(1); + return; + } + + /* Calculate the number of rcu_nodes at each level of the tree.
*/ + for (i = 1; i <= MAX_RCU_LVLS; i++) + if (n <= rcu_capacity[i]) { + for (j = 0; j <= i; j++) + num_rcu_lvl[j] = + DIV_ROUND_UP(n, rcu_capacity[i - j]); + rcu_num_lvls = i; + for (j = i + 1; j <= MAX_RCU_LVLS; j++) + num_rcu_lvl[j] = 0; + break; + } + + /* Calculate the total number of rcu_node structures. */ + rcu_num_nodes = 0; + for (i = 0; i <= MAX_RCU_LVLS; i++) + rcu_num_nodes += num_rcu_lvl[i]; + rcu_num_nodes -= n; } void __init rcu_init(void) @@ -2662,6 +2802,7 @@ void __init rcu_init(void) int cpu; rcu_bootup_announce(); + rcu_init_geometry(); rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ea056495783e..4d29169f2124 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -42,28 +42,28 @@ #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) #if NR_CPUS <= RCU_FANOUT_1 -# define NUM_RCU_LVLS 1 +# define RCU_NUM_LVLS 1 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_2 -# define NUM_RCU_LVLS 2 +# define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_3 -# define NUM_RCU_LVLS 3 +# define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_3 (NR_CPUS) # define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_4 -# define NUM_RCU_LVLS 4 +# define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) @@ -76,6 +76,9 @@ #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - 
NR_CPUS) +extern int rcu_num_lvls; +extern int rcu_num_nodes; + /* * Dynticks per-CPU state. */ @@ -97,6 +100,7 @@ struct rcu_dynticks { /* # times non-lazy CBs posted to CPU. */ unsigned long nonlazy_posted_snap; /* idle-period nonlazy_posted snapshot. */ + int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; @@ -206,7 +210,7 @@ struct rcu_node { */ #define rcu_for_each_node_breadth_first(rsp, rnp) \ for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* * Do a breadth-first scan of the non-leaf rcu_node structures for the @@ -215,7 +219,7 @@ struct rcu_node { */ #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) + (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) /* * Scan the leaves of the rcu_node hierarchy for the specified rcu_state @@ -224,8 +228,8 @@ struct rcu_node { * It is still a leaf node, even if it is also the root node. */ #define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ - (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ @@ -311,6 +315,9 @@ struct rcu_data { unsigned long n_rp_need_fqs; unsigned long n_rp_need_nothing; + /* 6) _rcu_barrier() callback. */ + struct rcu_head barrier_head; + int cpu; struct rcu_state *rsp; }; @@ -357,10 +364,12 @@ do { \ */ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ - u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. 
*/ + u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ + void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ + void (*func)(struct rcu_head *head)); /* The following fields are guarded by the root rcu_node's lock. */ @@ -392,6 +401,11 @@ struct rcu_state { struct task_struct *rcu_barrier_in_progress; /* Task doing rcu_barrier(), */ /* or NULL if no barrier. */ + struct mutex barrier_mutex; /* Guards barrier fields. */ + atomic_t barrier_cpu_count; /* # CPUs waiting on. */ + struct completion barrier_completion; /* Wake at barrier end. */ + unsigned long n_barrier_done; /* ++ at start and end of */ + /* _rcu_barrier(). */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -409,8 +423,13 @@ struct rcu_state { unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ char *name; /* Name of structure. */ + struct list_head flavors; /* List of RCU flavors. */ }; +extern struct list_head rcu_struct_flavors; +#define for_each_rcu_flavor(rsp) \ + list_for_each_entry((rsp), &rcu_struct_flavors, flavors) + /* Return values for rcu_preempt_offline_tasks(). 
*/ #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ @@ -444,6 +463,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, @@ -452,25 +472,18 @@ static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); -static void rcu_preempt_stall_reset(void); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_cleanup_dead_cpu(int cpu); static void rcu_preempt_check_callbacks(int cpu); -static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ -static int rcu_preempt_pending(int cpu); -static int rcu_preempt_cpu_has_callbacks(int cpu); -static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_cleanup_dying_cpu(void); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5271a020887e..7f3244c0df01 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -68,17 +68,21 @@ static 
void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); + printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); #endif + if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) + printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + if (nr_cpu_ids != NR_CPUS) + printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); } #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); +struct rcu_state rcu_preempt_state = + RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; -static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(struct rcu_node *rnp); /* @@ -153,7 +157,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +168,7 @@ void rcu_preempt_note_context_switch(void) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = __this_cpu_ptr(rcu_preempt_state.rda); + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,23 +232,11 @@ void rcu_preempt_note_context_switch(void) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(smp_processor_id()); + rcu_preempt_qs(cpu); local_irq_restore(flags); } /* - * Tree-preemptible RCU implementation for rcu_read_lock(). 
- * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -/* * Check for preempted RCU readers blocking the current grace period * for the specified rcu_node structure. If the caller needs a reliable * answer, it must hold the rcu_node's ->lock. @@ -310,7 +302,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static noinline void rcu_read_unlock_special(struct task_struct *t) +void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -398,8 +390,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) rnp->grphi, !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rnp, flags); - } else + } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); + } #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ @@ -418,38 +411,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) } } -/* - * Tree-preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting != 1) - --t->rcu_read_lock_nesting; - else { - barrier(); /* critical section before exit code. 
*/ - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - #ifdef CONFIG_RCU_CPU_STALL_VERBOSE /* @@ -540,16 +501,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* - * Suppress preemptible RCU's CPU stall warnings by pushing the - * time of the next stall-warning message comfortably far into the - * future. - */ -static void rcu_preempt_stall_reset(void) -{ - rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; -} - -/* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be @@ -650,14 +601,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Do CPU-offline processing for preemptible RCU. - */ -static void rcu_preempt_cleanup_dead_cpu(int cpu) -{ - rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); -} - -/* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, * which is checked elsewhere. @@ -677,15 +620,6 @@ static void rcu_preempt_check_callbacks(int cpu) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } -/* - * Process callbacks for preemptible RCU. 
- */ -static void rcu_preempt_process_callbacks(void) -{ - __rcu_process_callbacks(&rcu_preempt_state, - &__get_cpu_var(rcu_preempt_data)); -} - #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void) @@ -824,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); - if (list_empty(&rnp->blkd_tasks)) + if (list_empty(&rnp->blkd_tasks)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - else { + } else { rnp->exp_tasks = rnp->blkd_tasks.next; rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ must_wait = 1; @@ -870,9 +804,9 @@ void synchronize_rcu_expedited(void) * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (trycount++ < 10) + if (trycount++ < 10) { udelay(trycount * num_online_cpus()); - else { + } else { synchronize_rcu(); return; } @@ -917,51 +851,16 @@ mb_ret: } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -/* - * Check to see if there is any immediate preemptible-RCU-related work - * to be done. - */ -static int rcu_preempt_pending(int cpu) -{ - return __rcu_pending(&rcu_preempt_state, - &per_cpu(rcu_preempt_data, cpu)); -} - -/* - * Does preemptible RCU have callbacks on this CPU? - */ -static int rcu_preempt_cpu_has_callbacks(int cpu) -{ - return !!per_cpu(rcu_preempt_data, cpu).nxtlist; -} - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. */ void rcu_barrier(void) { - _rcu_barrier(&rcu_preempt_state, call_rcu); + _rcu_barrier(&rcu_preempt_state); } EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Initialize preemptible RCU's per-CPU data. - */ -static void __cpuinit rcu_preempt_init_percpu_data(int cpu) -{ - rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); -} - -/* - * Move preemptible RCU's callbacks from dying CPU to other online CPU - * and record a quiescent state. 
- */ -static void rcu_preempt_cleanup_dying_cpu(void) -{ - rcu_cleanup_dying_cpu(&rcu_preempt_state); -} - -/* * Initialize preemptible RCU's state structures. */ static void __init __rcu_init_preempt(void) @@ -1002,6 +901,14 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + +/* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ @@ -1038,14 +945,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* - * Because preemptible RCU does not exist, there is no need to suppress - * its CPU stall warnings. - */ -static void rcu_preempt_stall_reset(void) -{ -} - -/* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. @@ -1073,14 +972,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptible RCU does not exist, it never needs CPU-offline - * processing. - */ -static void rcu_preempt_cleanup_dead_cpu(int cpu) -{ -} - -/* * Because preemptible RCU does not exist, it never has any callbacks * to check. */ @@ -1089,14 +980,6 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Because preemptible RCU does not exist, it never has any callbacks - * to process. - */ -static void rcu_preempt_process_callbacks(void) -{ -} - -/* * Queue an RCU callback for lazy invocation after a grace period. * This will likely be later named something like "call_rcu_lazy()", * but this change will require some way of tagging the lazy RCU @@ -1137,22 +1020,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptible RCU does not exist, it never has any work to do. 
- */ -static int rcu_preempt_pending(int cpu) -{ - return 0; -} - -/* - * Because preemptible RCU does not exist, it never has callbacks - */ -static int rcu_preempt_cpu_has_callbacks(int cpu) -{ - return 0; -} - -/* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). */ @@ -1163,21 +1030,6 @@ void rcu_barrier(void) EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Because preemptible RCU does not exist, there is no per-CPU - * data to initialize. - */ -static void __cpuinit rcu_preempt_init_percpu_data(int cpu) -{ -} - -/* - * Because there is no preemptible RCU, there is no cleanup to do. - */ -static void rcu_preempt_cleanup_dying_cpu(void) -{ -} - -/* * Because preemptible RCU does not exist, it need not be initialized. */ static void __init __rcu_init_preempt(void) @@ -1960,9 +1812,11 @@ static void rcu_idle_count_callbacks_posted(void) */ #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ -#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ +#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ +extern int tick_nohz_enabled; + /* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? Both RCU flavor and CPU are specified by the @@ -2039,10 +1893,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) return 1; } /* Set up for the possibility that RCU will post a timer. 
*/ - if (rcu_cpu_has_nonlazy_callbacks(cpu)) - *delta_jiffies = RCU_IDLE_GP_DELAY; - else - *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, + RCU_IDLE_GP_DELAY) - jiffies; + } else { + *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; + *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; + } return 0; } @@ -2101,6 +1958,7 @@ static void rcu_cleanup_after_idle(int cpu) del_timer(&rdtp->idle_gp_timer); trace_rcu_prep_idle("Cleanup after idle"); + rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); } /* @@ -2126,6 +1984,18 @@ static void rcu_prepare_for_idle(int cpu) { struct timer_list *tp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + int tne; + + /* Handle nohz enablement switches conservatively. */ + tne = ACCESS_ONCE(tick_nohz_enabled); + if (tne != rdtp->tick_nohz_enabled_snap) { + if (rcu_cpu_has_callbacks(cpu)) + invoke_rcu_core(); /* force nohz to see update. 
*/ + rdtp->tick_nohz_enabled_snap = tne; + return; + } + if (!tne) + return; /* * If this is an idle re-entry, for example, due to use of @@ -2179,10 +2049,11 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_nonlazy_callbacks(cpu)) { trace_rcu_prep_idle("Dyntick with callbacks"); rdtp->idle_gp_timer_expires = - jiffies + RCU_IDLE_GP_DELAY; + round_up(jiffies + RCU_IDLE_GP_DELAY, + RCU_IDLE_GP_DELAY); } else { rdtp->idle_gp_timer_expires = - jiffies + RCU_IDLE_LAZY_GP_DELAY; + round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); trace_rcu_prep_idle("Dyntick with lazy callbacks"); } tp = &rdtp->idle_gp_timer; @@ -2223,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_callbacks(cpu)) { trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - } else + } else { trace_rcu_prep_idle("Callbacks drained"); + } } /* @@ -2261,6 +2133,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { + *cp = '\0'; } #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d4bc16ddd1d4..abffb486e94e 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,31 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +static int show_rcubarrier(struct seq_file *m, void *unused) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", + rsp->name, rsp->rcu_barrier_in_progress ? 
'B' : '.', + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); + return 0; +} + +static int rcubarrier_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcubarrier, NULL); +} + +static const struct file_operations rcubarrier_fops = { + .owner = THIS_MODULE, + .open = rcubarrier_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + #ifdef CONFIG_RCU_BOOST static char convert_kthread_status(unsigned int kthread_status) @@ -95,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -#define PRINT_RCU_DATA(name, func, m) \ - do { \ - int _p_r_d_i; \ - \ - for_each_possible_cpu(_p_r_d_i) \ - func(m, &per_cpu(name, _p_r_d_i)); \ - } while (0) - static int show_rcudata(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); - seq_puts(m, "rcu_bh:\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); + int cpu; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + seq_printf(m, "%s:\n", rsp->name); + for_each_possible_cpu(cpu) + print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); + } return 0; } @@ -166,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { + int cpu; + struct rcu_state *rsp; + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); @@ -173,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) seq_puts(m, "\"kt\",\"ktl\""); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 
-#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "\"rcu_preempt:\"\n"); - PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "\"rcu_sched:\"\n"); - PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); - seq_puts(m, "\"rcu_bh:\"\n"); - PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); + for_each_rcu_flavor(rsp) { + seq_printf(m, "\"%s:\"\n", rsp->name); + for_each_possible_cpu(cpu) + print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); + } return 0; } @@ -201,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = { static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) { - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " - "j=%04x bt=%04x\n", + seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", rnp->grplo, rnp->grphi, "T."[list_empty(&rnp->blkd_tasks)], "N."[!rnp->gp_tasks], @@ -210,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) "B."[!rnp->boost_tasks], convert_kthread_status(rnp->boost_kthread_status), rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts, + rnp->n_normal_boosts); + seq_printf(m, "j=%04x bt=%04x\n", (int)(jiffies & 0xffff), (int)(rnp->boost_time & 0xffff)); - seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - " balk", + seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", rnp->n_balk_blkd_tasks, rnp->n_balk_exp_gp_tasks, rnp->n_balk_boost_tasks, @@ -270,15 +286,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->completed, gpnum, rsp->fqs_state, + seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", + rsp->name, rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff), + 
(int)(jiffies & 0xffff)); + seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { + for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); level = rnp->level; @@ -295,14 +311,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) static int show_rcuhier(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_one_rcu_state(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_one_rcu_state(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_one_rcu_state(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + print_one_rcu_state(m, rsp); return 0; } @@ -343,11 +355,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) static int show_rcugp(struct seq_file *m, void *unused) { -#ifdef CONFIG_TREE_PREEMPT_RCU - show_one_rcugp(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - show_one_rcugp(m, &rcu_sched_state); - show_one_rcugp(m, &rcu_bh_state); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + show_one_rcugp(m, rsp); return 0; } @@ -366,44 +377,36 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { - seq_printf(m, "%3d%cnp=%ld " - "qsp=%ld rpq=%ld cbr=%ld cng=%ld " - "gpc=%ld gps=%ld nf=%ld nn=%ld\n", + seq_printf(m, "%3d%cnp=%ld ", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' 
: ' ', - rdp->n_rcu_pending, + rdp->n_rcu_pending); + seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", rdp->n_rp_qs_pending, rdp->n_rp_report_qs, rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp, + rdp->n_rp_cpu_needs_gp); + seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, rdp->n_rp_need_fqs, rdp->n_rp_need_nothing); } -static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +static int show_rcu_pending(struct seq_file *m, void *unused) { int cpu; struct rcu_data *rdp; - - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->beenonline) - print_one_rcu_pending(m, rdp); + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + seq_printf(m, "%s:\n", rsp->name); + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } } -} - -static int show_rcu_pending(struct seq_file *m, void *unused) -{ -#ifdef CONFIG_TREE_PREEMPT_RCU - seq_puts(m, "rcu_preempt:\n"); - print_rcu_pendings(m, &rcu_preempt_state); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_puts(m, "rcu_sched:\n"); - print_rcu_pendings(m, &rcu_sched_state); - seq_puts(m, "rcu_bh:\n"); - print_rcu_pendings(m, &rcu_bh_state); return 0; } @@ -453,6 +456,11 @@ static int __init rcutree_trace_init(void) if (!rcudir) goto free_out; + retval = debugfs_create_file("rcubarrier", 0444, rcudir, + NULL, &rcubarrier_fops); + if (!retval) + goto free_out; + retval = debugfs_create_file("rcudata", 0444, rcudir, NULL, &rcudata_fops); if (!retval) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..468bdd44c1ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. 
*/ - rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); @@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void) } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns into the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't lose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. 
+ * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} static long calc_load_fold_active(struct rq *this_rq) { @@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +/* + * a1 = a0 * e + a * (1 - e) + */ static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { @@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) #ifdef CONFIG_NO_HZ /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. 
+ * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumulating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; -void calc_load_account_idle(struct rq *this_rq) +static inline int calc_load_write_idx(void) { + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); long delta; + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. 
+ */ delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } } -static long calc_load_fold_idle(void) +void calc_load_exit_idle(void) { - long delta = 0; + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; /* - * Its got a race, we don't care... + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); return delta; } @@ -2302,66 +2454,39 @@ static void calc_global_nohz(void) { long delta, active, n; - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - /* - * It could be the one fold was all it took, we done! 
- */ - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - calc_load_update += n * LOAD_FREQ; -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} + calc_load_update += n * LOAD_FREQ; + } -static inline long calc_load_fold_idle(void) -{ - return 0; + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; } +#else /* !CONFIG_NO_HZ */ -static void calc_global_nohz(void) -{ -} -#endif +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. 
- */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) */ void calc_global_load(unsigned long ticks) { - long active; + long active, delta; if (time_before(jiffies, calc_load_update + 10)) return; + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + active = atomic_long_read(&calc_load_tasks); active = active > 0 ? active * FIXED_1 : 0; @@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks) calc_load_update += LOAD_FREQ; /* - * Account one period with whatever state we found before - * folding in the nohz state and ageing the entire idle period. - * - * This avoids loosing a sample when we go idle between - * calc_load_account_active() (10 ticks ago) and now and thus - * under-accounting. + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 
*/ calc_global_nohz(); } @@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) return; delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); if (delta) atomic_long_add(delta, &calc_load_tasks); @@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) } /* + * End of global load-average stuff + */ + +/* * The exact cpuload at various idx values, calculated at every tick would be * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load * diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - calc_load_account_idle(rq); return rq->idle; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea7f33d..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void) return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; } -void calc_load_account_idle(struct rq *this_rq); - #ifdef CONFIG_SCHED_HRTICK /* diff --git a/kernel/smp.c b/kernel/smp.c index d0ae5b24875e..29dd40a9f2f4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) return 0; } EXPORT_SYMBOL(smp_call_function); - -void ipi_call_lock(void) -{ - raw_spin_lock(&call_function.lock); -} - -void ipi_call_unlock(void) -{ - raw_spin_unlock(&call_function.lock); -} - -void ipi_call_lock_irq(void) -{ - raw_spin_lock_irq(&call_function.lock); -} - -void ipi_call_unlock_irq(void) -{ - raw_spin_unlock_irq(&call_function.lock); -} #endif /* USE_GENERIC_SMP_HELPERS */ /* Setup configured maximum number of CPUs to activate */ diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 
80c0acfb8472..6ef9433e1c70 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -3,8 +3,6 @@ struct task_struct; -int smpboot_prepare(unsigned int cpu); - #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD struct task_struct *idle_thread_get(unsigned int cpu); void idle_thread_set_boot_cpu(void); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 70b33abcc7bb..b7fbadc5c973 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -409,7 +409,9 @@ int second_overflow(unsigned long secs) time_state = TIME_DEL; break; case TIME_INS: - if (secs % 86400 == 0) { + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; time_tai++; @@ -418,7 +420,9 @@ int second_overflow(unsigned long secs) } break; case TIME_DEL: - if ((secs + 1) % 86400 == 0) { + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if ((secs + 1) % 86400 == 0) { leap = 1; time_tai--; time_state = TIME_WAIT; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..024540f97f74 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void) /* * NO HZ enabled ? 
*/ -static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_enabled __read_mostly = 1; /* * Enable / Disable tickless mode @@ -271,50 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -static void tick_nohz_stop_sched_tick(struct tick_sched *ts) +static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + ktime_t last_update, expires, ret = { .tv64 = 0 }; unsigned long rcu_delta_jiffies; - ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; - int cpu; - - cpu = smp_processor_id(); - ts = &per_cpu(tick_cpu_sched, cpu); - - now = tick_nohz_start_idle(cpu, ts); - - /* - * If this cpu is offline and it is the one which updates - * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. - */ - if (unlikely(!cpu_online(cpu))) { - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - } - - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - return; - if (need_resched()) - return; - - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { - static int ratelimit; - - if (ratelimit < 10) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } - return; - } - - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&xtime_lock); @@ -397,6 +362,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) goto out; + ret = expires; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. 
This happens when @@ -406,17 +373,12 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) */ if (!ts->tick_stopped) { select_nohz_load_balancer(1); + calc_load_enter_idle(); - ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - ts->idle_jiffies = last_jiffies; } - ts->idle_sleeps++; - - /* Mark expires */ - ts->idle_expires = expires; - /* * If the expiration time == KTIME_MAX, then * in this case we simply stop the tick timer. @@ -447,6 +409,65 @@ out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); + + return ret; +} + +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. 
+ */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + return false; + + if (need_resched()) + return false; + + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + static int ratelimit; + + if (ratelimit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } + return false; + } + + return true; +} + +static void __tick_nohz_idle_enter(struct tick_sched *ts) +{ + ktime_t now, expires; + int cpu = smp_processor_id(); + + now = tick_nohz_start_idle(cpu, ts); + + if (can_stop_idle_tick(cpu, ts)) { + int was_stopped = ts->tick_stopped; + + ts->idle_calls++; + + expires = tick_nohz_stop_sched_tick(ts, now, cpu); + if (expires.tv64 > 0LL) { + ts->idle_sleeps++; + ts->idle_expires = expires; + } + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; + } } /** @@ -484,7 +505,7 @@ void tick_nohz_idle_enter(void) * update of the idle time accounting in tick_nohz_start_idle(). 
*/ ts->inidle = 1; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); local_irq_enable(); } @@ -504,7 +525,7 @@ void tick_nohz_irq_exit(void) if (!ts->inidle) return; - tick_nohz_stop_sched_tick(ts); + __tick_nohz_idle_enter(ts); } /** @@ -522,7 +543,7 @@ ktime_t tick_nohz_get_sleep_length(void) static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); while (1) { /* Forward the time to expire in the future */ @@ -545,6 +566,41 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } } +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +{ + /* Update jiffies first */ + select_nohz_load_balancer(0); + tick_do_update_jiffies64(now); + update_cpu_load_nohz(); + + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); +} + +static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + unsigned long ticks; + /* + * We stopped the tick in idle. Update process times would miss the + * time we slept as update_process_times does only a 1 tick + * accounting. Enforce that this is accounted to idle ! + */ + ticks = jiffies - ts->idle_jiffies; + /* + * We might be one off. Do not randomly account a huge number of ticks! 
+ */ + if (ticks && ticks < LONG_MAX) + account_idle_ticks(ticks); +#endif +} + /** * tick_nohz_idle_exit - restart the idle tick from the idle task * @@ -556,9 +612,6 @@ void tick_nohz_idle_exit(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - unsigned long ticks; -#endif ktime_t now; local_irq_disable(); @@ -573,39 +626,11 @@ void tick_nohz_idle_exit(void) if (ts->idle_active) tick_nohz_stop_idle(cpu, now); - if (!ts->tick_stopped) { - local_irq_enable(); - return; + if (ts->tick_stopped) { + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); } - /* Update jiffies first */ - select_nohz_load_balancer(0); - tick_do_update_jiffies64(now); - update_cpu_load_nohz(); - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - /* - * We stopped the tick in idle. Update process times would miss the - * time we slept as update_process_times does only a 1 tick - * accounting. Enforce that this is accounted to idle ! - */ - ticks = jiffies - ts->idle_jiffies; - /* - * We might be one off. Do not randomly account a huge number of ticks! - */ - if (ticks && ticks < LONG_MAX) - account_idle_ticks(ticks); -#endif - - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - - tick_nohz_restart(ts, now); - local_irq_enable(); } @@ -809,7 +834,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (ts->tick_stopped) { touch_softlockup_watchdog(); - ts->idle_jiffies++; + if (idle_cpu(cpu)) + ts->idle_jiffies++; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 269b1fe5f2ae..f045cc50832d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -24,32 +24,32 @@ /* Structure holding internal timekeeping values. */ struct timekeeper { /* Current clocksource used for timekeeping. 
*/ - struct clocksource *clock; + struct clocksource *clock; /* NTP adjusted clock multiplier */ - u32 mult; + u32 mult; /* The shift value of the current clocksource. */ - int shift; - + u32 shift; /* Number of clock cycles in one NTP interval. */ - cycle_t cycle_interval; + cycle_t cycle_interval; /* Number of clock shifted nano seconds in one NTP interval. */ - u64 xtime_interval; + u64 xtime_interval; /* shifted nano seconds left over when rounding cycle_interval */ - s64 xtime_remainder; + s64 xtime_remainder; /* Raw nano seconds accumulated per NTP interval. */ - u32 raw_interval; + u32 raw_interval; + + /* Current CLOCK_REALTIME time in seconds */ + u64 xtime_sec; + /* Clock shifted nano seconds */ + u64 xtime_nsec; - /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ - u64 xtime_nsec; /* Difference between accumulated time and NTP time in ntp * shifted nano seconds. */ - s64 ntp_error; + s64 ntp_error; /* Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. */ - int ntp_error_shift; + u32 ntp_error_shift; - /* The current time */ - struct timespec xtime; /* * wall_to_monotonic is what we need to add to xtime (or xtime corrected * for sub jiffie times) to get to monotonic time. Monotonic is pegged @@ -64,20 +64,17 @@ struct timekeeper { * - wall_to_monotonic is no longer the boot time, getboottime must be * used instead. */ - struct timespec wall_to_monotonic; + struct timespec wall_to_monotonic; /* time spent in suspend */ - struct timespec total_sleep_time; + struct timespec total_sleep_time; /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 
*/ - struct timespec raw_time; - + struct timespec raw_time; /* Offset clock monotonic -> clock realtime */ - ktime_t offs_real; - + ktime_t offs_real; /* Offset clock monotonic -> clock boottime */ - ktime_t offs_boot; - + ktime_t offs_boot; /* Seqlock for all timekeeper values */ - seqlock_t lock; + seqlock_t lock; }; static struct timekeeper timekeeper; @@ -88,11 +85,37 @@ static struct timekeeper timekeeper; */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; +static inline void tk_normalize_xtime(struct timekeeper *tk) +{ + while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { + tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; + tk->xtime_sec++; + } +} +static struct timespec tk_xtime(struct timekeeper *tk) +{ + struct timespec ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); + return ts; +} + +static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +{ + tk->xtime_sec = ts->tv_sec; + tk->xtime_nsec = ts->tv_nsec << tk->shift; +} + +static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +{ + tk->xtime_sec += ts->tv_sec; + tk->xtime_nsec += ts->tv_nsec << tk->shift; +} /** * timekeeper_setup_internals - Set up internals to use clocksource clock. @@ -104,12 +127,14 @@ int __read_mostly timekeeping_suspended; * * Unless you're the timekeeping code, you should not be using this! 
*/ -static void timekeeper_setup_internals(struct clocksource *clock) +static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { cycle_t interval; u64 tmp, ntpinterval; + struct clocksource *old_clock; - timekeeper.clock = clock; + old_clock = tk->clock; + tk->clock = clock; clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ @@ -122,80 +147,96 @@ static void timekeeper_setup_internals(struct clocksource *clock) tmp = 1; interval = (cycle_t) tmp; - timekeeper.cycle_interval = interval; + tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ - timekeeper.xtime_interval = (u64) interval * clock->mult; - timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; - timekeeper.raw_interval = + tk->xtime_interval = (u64) interval * clock->mult; + tk->xtime_remainder = ntpinterval - tk->xtime_interval; + tk->raw_interval = ((u64) interval * clock->mult) >> clock->shift; - timekeeper.xtime_nsec = 0; - timekeeper.shift = clock->shift; + /* if changing clocks, convert xtime_nsec shift units */ + if (old_clock) { + int shift_change = clock->shift - old_clock->shift; + if (shift_change < 0) + tk->xtime_nsec >>= -shift_change; + else + tk->xtime_nsec <<= shift_change; + } + tk->shift = clock->shift; - timekeeper.ntp_error = 0; - timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_error = 0; + tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - timekeeper.mult = clock->mult; + tk->mult = clock->mult; } /* Timekeeper helper functions. 
*/ -static inline s64 timekeeping_get_ns(void) +static inline s64 timekeeping_get_ns(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; + s64 nsec; /* read clocksource: */ - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds using ntp adjusted mult. */ - return clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + nsec = cycle_delta * tk->mult + tk->xtime_nsec; + nsec >>= tk->shift; + + /* If arch requires, add in gettimeoffset() */ + return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns_raw(void) +static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; + s64 nsec; /* read clocksource: */ - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds. */ - return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + /* convert delta to nanoseconds. 
*/ + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + + /* If arch requires, add in gettimeoffset() */ + return nsec + arch_gettimeoffset(); } -static void update_rt_offset(void) +static void update_rt_offset(struct timekeeper *tk) { - struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; + struct timespec tmp, *wtm = &tk->wall_to_monotonic; set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); - timekeeper.offs_real = timespec_to_ktime(tmp); + tk->offs_real = timespec_to_ktime(tmp); } /* must hold write on timekeeper.lock */ -static void timekeeping_update(bool clearntp) +static void timekeeping_update(struct timekeeper *tk, bool clearntp) { + struct timespec xt; + if (clearntp) { - timekeeper.ntp_error = 0; + tk->ntp_error = 0; ntp_clear(); } - update_rt_offset(); - update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, - timekeeper.clock, timekeeper.mult); + update_rt_offset(tk); + xt = tk_xtime(tk); + update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); } @@ -206,27 +247,26 @@ static void timekeeping_update(bool clearntp) * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. 
*/ -static void timekeeping_forward_now(void) +static void timekeeping_forward_now(struct timekeeper *tk) { cycle_t cycle_now, cycle_delta; struct clocksource *clock; s64 nsec; - clock = timekeeper.clock; + clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, - timekeeper.shift); + tk->xtime_nsec += cycle_delta * tk->mult; /* If arch requires, add in gettimeoffset() */ - nsec += arch_gettimeoffset(); + tk->xtime_nsec += arch_gettimeoffset() << tk->shift; - timespec_add_ns(&timekeeper.xtime, nsec); + tk_normalize_xtime(tk); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&timekeeper.raw_time, nsec); + timespec_add_ns(&tk->raw_time, nsec); } /** @@ -238,18 +278,15 @@ static void timekeeping_forward_now(void) void getnstimeofday(struct timespec *ts) { unsigned long seq; - s64 nsecs; + s64 nsecs = 0; WARN_ON(timekeeping_suspended); do { seq = read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; - nsecs = timekeeping_get_ns(); - - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); + ts->tv_sec = timekeeper.xtime_sec; + ts->tv_nsec = timekeeping_get_ns(&timekeeper); } while (read_seqretry(&timekeeper.lock, seq)); @@ -266,13 +303,10 @@ ktime_t ktime_get(void) do { seq = read_seqbegin(&timekeeper.lock); - secs = timekeeper.xtime.tv_sec + + secs = timekeeper.xtime_sec + timekeeper.wall_to_monotonic.tv_sec; - nsecs = timekeeper.xtime.tv_nsec + + nsecs = timekeeping_get_ns(&timekeeper) + timekeeper.wall_to_monotonic.tv_nsec; - nsecs += timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); } while (read_seqretry(&timekeeper.lock, seq)); /* @@ -295,22 +329,19 @@ void ktime_get_ts(struct timespec *ts) { struct timespec tomono; unsigned int seq; - s64 nsecs; WARN_ON(timekeeping_suspended); do { seq = 
read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; + ts->tv_sec = timekeeper.xtime_sec; + ts->tv_nsec = timekeeping_get_ns(&timekeeper); tomono = timekeeper.wall_to_monotonic; - nsecs = timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); } while (read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec + nsecs); + ts->tv_nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); @@ -333,20 +364,14 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - u32 arch_offset; - seq = read_seqbegin(&timekeeper.lock); *ts_raw = timekeeper.raw_time; - *ts_real = timekeeper.xtime; - - nsecs_raw = timekeeping_get_ns_raw(); - nsecs_real = timekeeping_get_ns(); + ts_real->tv_sec = timekeeper.xtime_sec; + ts_real->tv_nsec = 0; - /* If arch requires, add in gettimeoffset() */ - arch_offset = arch_gettimeoffset(); - nsecs_raw += arch_offset; - nsecs_real += arch_offset; + nsecs_raw = timekeeping_get_ns_raw(&timekeeper); + nsecs_real = timekeeping_get_ns(&timekeeper); } while (read_seqretry(&timekeeper.lock, seq)); @@ -381,7 +406,7 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(const struct timespec *tv) { - struct timespec ts_delta; + struct timespec ts_delta, xt; unsigned long flags; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) @@ -389,15 +414,18 @@ int do_settimeofday(const struct timespec *tv) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); + + xt = tk_xtime(&timekeeper); + ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; - ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec; timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, ts_delta); - timekeeper.xtime = *tv; - 
timekeeping_update(true); + tk_set_xtime(&timekeeper, tv); + + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -424,13 +452,14 @@ int timekeeping_inject_offset(struct timespec *ts) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); + - timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); + tk_xtime_add(&timekeeper, ts); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *ts); - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -455,14 +484,14 @@ static int change_clocksource(void *data) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); if (!new->enable || new->enable(new) == 0) { old = timekeeper.clock; - timekeeper_setup_internals(new); + tk_setup_internals(&timekeeper, new); if (old->disable) old->disable(old); } - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -512,7 +541,7 @@ void getrawmonotonic(struct timespec *ts) do { seq = read_seqbegin(&timekeeper.lock); - nsecs = timekeeping_get_ns_raw(); + nsecs = timekeeping_get_ns_raw(&timekeeper); *ts = timekeeper.raw_time; } while (read_seqretry(&timekeeper.lock, seq)); @@ -547,6 +576,7 @@ u64 timekeeping_max_deferment(void) { unsigned long seq; u64 ret; + do { seq = read_seqbegin(&timekeeper.lock); @@ -607,19 +637,17 @@ void __init timekeeping_init(void) clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - timekeeper_setup_internals(clock); + tk_setup_internals(&timekeeper, clock); - timekeeper.xtime.tv_sec = now.tv_sec; - timekeeper.xtime.tv_nsec = now.tv_nsec; + tk_set_xtime(&timekeeper, &now); timekeeper.raw_time.tv_sec = 0; timekeeper.raw_time.tv_nsec = 0; - if (boot.tv_sec == 0 && boot.tv_nsec == 0) { - boot.tv_sec = 
timekeeper.xtime.tv_sec; - boot.tv_nsec = timekeeper.xtime.tv_nsec; - } + if (boot.tv_sec == 0 && boot.tv_nsec == 0) + boot = tk_xtime(&timekeeper); + set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_rt_offset(); + update_rt_offset(&timekeeper); timekeeper.total_sleep_time.tv_sec = 0; timekeeper.total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -641,7 +669,8 @@ static void update_sleep_time(struct timespec t) * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. */ -static void __timekeeping_inject_sleeptime(struct timespec *delta) +static void __timekeeping_inject_sleeptime(struct timekeeper *tk, + struct timespec *delta) { if (!timespec_valid(delta)) { printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " @@ -649,10 +678,9 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) return; } - timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, *delta); - update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); + tk_xtime_add(tk, delta); + tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); + update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); } @@ -678,11 +706,11 @@ void timekeeping_inject_sleeptime(struct timespec *delta) write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); - __timekeeping_inject_sleeptime(delta); + __timekeeping_inject_sleeptime(&timekeeper, delta); - timekeeping_update(true); + timekeeping_update(&timekeeper, true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -711,12 +739,13 @@ static void timekeeping_resume(void) if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(&ts); + 
__timekeeping_inject_sleeptime(&timekeeper, &ts); } /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; timekeeping_suspended = 0; + timekeeping_update(&timekeeper, false); write_sequnlock_irqrestore(&timekeeper.lock, flags); touch_softlockup_watchdog(); @@ -736,7 +765,7 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(); + timekeeping_forward_now(&timekeeper); timekeeping_suspended = 1; /* @@ -745,7 +774,7 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); + delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); delta_delta = timespec_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -784,7 +813,8 @@ device_initcall(timekeeping_init_ops); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, + s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -800,7 +830,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 
*/ - error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -809,8 +839,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); - tick_error -= timekeeper.xtime_interval >> 1; + tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); + tick_error -= tk->xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. */ @@ -835,9 +865,9 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void timekeeping_adjust(s64 offset) +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - s64 error, interval = timekeeper.cycle_interval; + s64 error, interval = tk->cycle_interval; int adj; /* @@ -853,7 +883,7 @@ static void timekeeping_adjust(s64 offset) * * Note: It does not "save" on aggravation when reading the code. 
*/ - error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); + error = tk->ntp_error >> (tk->ntp_error_shift - 1); if (error > interval) { /* * We now divide error by 4(via shift), which checks if @@ -875,7 +905,8 @@ static void timekeeping_adjust(s64 offset) if (likely(error <= interval)) adj = 1; else - adj = timekeeping_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(tk, error, &interval, + &offset); } else if (error < -interval) { /* See comment above, this is just switched for the negative */ error >>= 2; @@ -884,18 +915,17 @@ static void timekeeping_adjust(s64 offset) interval = -interval; offset = -offset; } else - adj = timekeeping_bigadjust(error, &interval, &offset); - } else /* No adjustment needed */ + adj = timekeeping_bigadjust(tk, error, &interval, + &offset); + } else return; - if (unlikely(timekeeper.clock->maxadj && - (timekeeper.mult + adj > - timekeeper.clock->mult + timekeeper.clock->maxadj))) { + if (unlikely(tk->clock->maxadj && + (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", - timekeeper.clock->name, (long)timekeeper.mult + adj, - (long)timekeeper.clock->mult + - timekeeper.clock->maxadj); + tk->clock->name, (long)tk->mult + adj, + (long)tk->clock->mult + tk->clock->maxadj); } /* * So the following can be confusing. @@ -946,11 +976,60 @@ static void timekeeping_adjust(s64 offset) * * XXX - TODO: Doc ntp_error calculation. */ - timekeeper.mult += adj; - timekeeper.xtime_interval += interval; - timekeeper.xtime_nsec -= offset; - timekeeper.ntp_error -= (interval - offset) << - timekeeper.ntp_error_shift; + tk->mult += adj; + tk->xtime_interval += interval; + tk->xtime_nsec -= offset; + tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; + + /* + * It may be possible that when we entered this function, xtime_nsec + * was very small. 
Further, if we're slightly speeding the clocksource + * in the code above, its possible the required corrective factor to + * xtime_nsec could cause it to underflow. + * + * Now, since we already accumulated the second, cannot simply roll + * the accumulated second back, since the NTP subsystem has been + * notified via second_overflow. So instead we push xtime_nsec forward + * by the amount we underflowed, and add that amount into the error. + * + * We'll correct this error next time through this function, when + * xtime_nsec is not as small. + */ + if (unlikely((s64)tk->xtime_nsec < 0)) { + s64 neg = -(s64)tk->xtime_nsec; + tk->xtime_nsec = 0; + tk->ntp_error += neg << tk->ntp_error_shift; + } + +} + + +/** + * accumulate_nsecs_to_secs - Accumulates nsecs into secs + * + * Helper function that accumulates a the nsecs greater then a second + * from the xtime_nsec field to the xtime_secs field. + * It also calls into the NTP code to handle leapsecond processing. + * + */ +static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) +{ + u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + + while (tk->xtime_nsec >= nsecps) { + int leap; + + tk->xtime_nsec -= nsecps; + tk->xtime_sec++; + + /* Figure out if its a leap sec and apply if needed */ + leap = second_overflow(tk->xtime_sec); + tk->xtime_sec += leap; + tk->wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); + + } } @@ -963,46 +1042,36 @@ static void timekeeping_adjust(s64 offset) * * Returns the unconsumed cycles. 
*/ -static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, + u32 shift) { - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; u64 raw_nsecs; - /* If the offset is smaller than a shifted interval, do nothing */ - if (offset < timekeeper.cycle_interval<<shift) + /* If the offset is smaller then a shifted interval, do nothing */ + if (offset < tk->cycle_interval<<shift) return offset; /* Accumulate one shifted interval */ - offset -= timekeeper.cycle_interval << shift; - timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; + offset -= tk->cycle_interval << shift; + tk->clock->cycle_last += tk->cycle_interval << shift; - timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; - while (timekeeper.xtime_nsec >= nsecps) { - int leap; - timekeeper.xtime_nsec -= nsecps; - timekeeper.xtime.tv_sec++; - leap = second_overflow(timekeeper.xtime.tv_sec); - timekeeper.xtime.tv_sec += leap; - timekeeper.wall_to_monotonic.tv_sec -= leap; - if (leap) - clock_was_set_delayed(); - } + tk->xtime_nsec += tk->xtime_interval << shift; + accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = timekeeper.raw_interval << shift; - raw_nsecs += timekeeper.raw_time.tv_nsec; + raw_nsecs = tk->raw_interval << shift; + raw_nsecs += tk->raw_time.tv_nsec; if (raw_nsecs >= NSEC_PER_SEC) { u64 raw_secs = raw_nsecs; raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - timekeeper.raw_time.tv_sec += raw_secs; + tk->raw_time.tv_sec += raw_secs; } - timekeeper.raw_time.tv_nsec = raw_nsecs; + tk->raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - timekeeper.ntp_error += ntp_tick_length() << shift; - timekeeper.ntp_error -= - (timekeeper.xtime_interval + timekeeper.xtime_remainder) << - (timekeeper.ntp_error_shift + shift); + tk->ntp_error += ntp_tick_length() << shift; + tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << + (tk->ntp_error_shift 
+ shift); return offset; } @@ -1018,6 +1087,7 @@ static void update_wall_time(void) cycle_t offset; int shift = 0, maxshift; unsigned long flags; + s64 remainder; write_seqlock_irqsave(&timekeeper.lock, flags); @@ -1032,8 +1102,6 @@ static void update_wall_time(void) #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif - timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec << - timekeeper.shift; /* * With NO_HZ we may have to accumulate many cycle_intervals @@ -1049,64 +1117,36 @@ static void update_wall_time(void) maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { - offset = logarithmic_accumulation(offset, shift); + offset = logarithmic_accumulation(&timekeeper, offset, shift); if(offset < timekeeper.cycle_interval<<shift) shift--; } /* correct the clock when NTP error is too big */ - timekeeping_adjust(offset); - - /* - * Since in the loop above, we accumulate any amount of time - * in xtime_nsec over a second into xtime.tv_sec, its possible for - * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in timekeeping_adjust(), - * its possible the required corrective factor to xtime_nsec could - * cause it to underflow. - * - * Now, we cannot simply roll the accumulated second back, since - * the NTP subsystem has been notified via second_overflow. So - * instead we push xtime_nsec forward by the amount we underflowed, - * and add that amount into the error. - * - * We'll correct this error next time through this function, when - * xtime_nsec is not as small. - */ - if (unlikely((s64)timekeeper.xtime_nsec < 0)) { - s64 neg = -(s64)timekeeper.xtime_nsec; - timekeeper.xtime_nsec = 0; - timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; - } + timekeeping_adjust(&timekeeper, offset); /* - * Store full nanoseconds into xtime after rounding it up and - * add the remainder to the error difference. 
- */ - timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> - timekeeper.shift) + 1; - timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << - timekeeper.shift; - timekeeper.ntp_error += timekeeper.xtime_nsec << - timekeeper.ntp_error_shift; + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistencies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), this can be killed. + */ + remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1); + timekeeper.xtime_nsec -= remainder; + timekeeper.xtime_nsec += 1 << timekeeper.shift; + timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift; /* * Finally, make sure that after the rounding - * xtime.tv_nsec isn't larger than NSEC_PER_SEC + * xtime_nsec isn't larger than NSEC_PER_SEC */ - if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { - int leap; - timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; - timekeeper.xtime.tv_sec++; - leap = second_overflow(timekeeper.xtime.tv_sec); - timekeeper.xtime.tv_sec += leap; - timekeeper.wall_to_monotonic.tv_sec -= leap; - if (leap) - clock_was_set_delayed(); - } + accumulate_nsecs_to_secs(&timekeeper); - timekeeping_update(false); + timekeeping_update(&timekeeper, false); out: write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -1151,21 +1191,20 @@ void get_monotonic_boottime(struct timespec *ts) { struct timespec tomono, sleep; unsigned int seq; - s64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqbegin(&timekeeper.lock); - *ts = timekeeper.xtime; + ts->tv_sec = timekeeper.xtime_sec; + ts->tv_nsec = timekeeping_get_ns(&timekeeper); tomono = timekeeper.wall_to_monotonic; sleep = timekeeper.total_sleep_time; - nsecs = timekeeping_get_ns(); } while 
(read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, - ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); + ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); } EXPORT_SYMBOL_GPL(get_monotonic_boottime); @@ -1198,13 +1237,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return timekeeper.xtime.tv_sec; + return timekeeper.xtime_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return timekeeper.xtime; + return tk_xtime(&timekeeper); } struct timespec current_kernel_time(void) @@ -1215,7 +1254,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&timekeeper.lock); - now = timekeeper.xtime; + now = tk_xtime(&timekeeper); } while (read_seqretry(&timekeeper.lock, seq)); return now; @@ -1230,7 +1269,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&timekeeper.lock); - now = timekeeper.xtime; + now = tk_xtime(&timekeeper); mono = timekeeper.wall_to_monotonic; } while (read_seqretry(&timekeeper.lock, seq)); @@ -1265,7 +1304,7 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, do { seq = read_seqbegin(&timekeeper.lock); - *xtim = timekeeper.xtime; + *xtim = tk_xtime(&timekeeper); *wtom = timekeeper.wall_to_monotonic; *sleep = timekeeper.total_sleep_time; } while (read_seqretry(&timekeeper.lock, seq)); @@ -1289,11 +1328,8 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) do { seq = read_seqbegin(&timekeeper.lock); - secs = timekeeper.xtime.tv_sec; - nsecs = timekeeper.xtime.tv_nsec; - nsecs += timekeeping_get_ns(); - /* If arch requires, add in gettimeoffset() */ - nsecs += arch_gettimeoffset(); + secs = timekeeper.xtime_sec; + nsecs = timekeeping_get_ns(&timekeeper); *offs_real = timekeeper.offs_real; *offs_boot = timekeeper.offs_boot; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3258455549f4..af5a7e9f164b 100644 --- 
a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); - P_ns(idle_tick); + P_ns(last_tick); P(tick_stopped); P(idle_jiffies); P(idle_calls); @@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.6\n"); + SEQ_printf(m, "Timer List Version: v0.7\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index 6ec7e7e0db43..a61c09374eba 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -77,6 +77,7 @@ struct tvec_base { struct timer_list *running_timer; unsigned long timer_jiffies; unsigned long next_timer; + unsigned long active_timers; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -330,7 +331,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void +__internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; @@ -372,6 +374,19 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + __internal_add_timer(base, timer); + /* + * Update base->active_timers and base->next_timer + */ + if (!tbase_get_deferrable(timer->base)) { + if (time_before(timer->expires, base->next_timer)) + base->next_timer = timer->expires; + base->active_timers++; + } +} + #ifdef CONFIG_TIMER_STATS void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) { @@ -654,8 +669,7 @@ void 
init_timer_deferrable_key(struct timer_list *timer, } EXPORT_SYMBOL(init_timer_deferrable_key); -static inline void detach_timer(struct timer_list *timer, - int clear_pending) +static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct list_head *entry = &timer->entry; @@ -667,6 +681,29 @@ static inline void detach_timer(struct timer_list *timer, entry->prev = LIST_POISON2; } +static inline void +detach_expired_timer(struct timer_list *timer, struct tvec_base *base) +{ + detach_timer(timer, true); + if (!tbase_get_deferrable(timer->base)) + timer->base->active_timers--; +} + +static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, + bool clear_pending) +{ + if (!timer_pending(timer)) + return 0; + + detach_timer(timer, clear_pending); + if (!tbase_get_deferrable(timer->base)) { + timer->base->active_timers--; + if (timer->expires == base->next_timer) + base->next_timer = base->timer_jiffies; + } + return 1; +} + /* * We are using hashed locking: holding per_cpu(tvec_bases).lock * means that all timers which are tied to this base via timer->base are @@ -712,16 +749,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 0); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } else { - if (pending_only) - goto out_unlock; - } + ret = detach_if_pending(timer, base, false); + if (!ret && pending_only) + goto out_unlock; debug_activate(timer, expires); @@ -752,9 +782,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -920,9 +947,6 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, 
flags); timer_set_base(timer, base); debug_activate(timer, timer->expires); - if (time_before(timer->expires, base->next_timer) && - !tbase_get_deferrable(timer->base)) - base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -959,13 +983,7 @@ int del_timer(struct timer_list *timer) timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; - } + ret = detach_if_pending(timer, base, true); spin_unlock_irqrestore(&base->lock, flags); } @@ -990,19 +1008,10 @@ int try_to_del_timer_sync(struct timer_list *timer) base = lock_timer_base(timer, &flags); - if (base->running_timer == timer) - goto out; - - timer_stats_timer_clear_start_info(timer); - ret = 0; - if (timer_pending(timer)) { - detach_timer(timer, 1); - if (timer->expires == base->next_timer && - !tbase_get_deferrable(timer->base)) - base->next_timer = base->timer_jiffies; - ret = 1; + if (base->running_timer != timer) { + timer_stats_timer_clear_start_info(timer); + ret = detach_if_pending(timer, base, true); } -out: spin_unlock_irqrestore(&base->lock, flags); return ret; @@ -1089,7 +1098,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) */ list_for_each_entry_safe(timer, tmp, &tv_list, entry) { BUG_ON(tbase_get_base(timer->base) != base); - internal_add_timer(base, timer); + /* No accounting, while moving them */ + __internal_add_timer(base, timer); } return index; @@ -1178,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); base->running_timer = timer; - detach_timer(timer, 1); + detach_expired_timer(timer, base); spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); @@ -1316,18 +1326,21 @@ static unsigned long 
cmp_next_hrtimer_event(unsigned long now, unsigned long get_next_timer_interrupt(unsigned long now) { struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires; + unsigned long expires = now + NEXT_TIMER_MAX_DELTA; /* * Pretend that there is no timer pending if the cpu is offline. * Possible pending timers will be migrated later to an active cpu. */ if (cpu_is_offline(smp_processor_id())) - return now + NEXT_TIMER_MAX_DELTA; + return expires; + spin_lock(&base->lock); - if (time_before_eq(base->next_timer, base->timer_jiffies)) - base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; + if (base->active_timers) { + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; + } spin_unlock(&base->lock); if (time_before_eq(expires, now)) @@ -1704,6 +1717,7 @@ static int __cpuinit init_timers_cpu(int cpu) base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; + base->active_timers = 0; return 0; } @@ -1714,11 +1728,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea while (!list_empty(head)) { timer = list_first_entry(head, struct timer_list, entry); - detach_timer(timer, 0); + /* We ignore the accounting on the dying cpu */ + detach_timer(timer, false); timer_set_base(timer, new_base); - if (time_before(timer->expires, new_base->next_timer) && - !tbase_get_deferrable(timer->base)) - new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a008663d86c8..b4f20fba09fc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (ftrace_disabled) + if (unlikely(ftrace_disabled)) return -ENODEV; if (FTRACE_WARN_ON(ops == &global_ops)) @@ -4299,16 +4299,12 @@ int 
register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); - if (unlikely(ftrace_disabled)) - goto out_unlock; - ret = __register_ftrace_function(ops); if (!ret) ret = ftrace_startup(ops, 0); - - out_unlock: mutex_unlock(&ftrace_lock); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_function); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..49491fa7daa2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) @@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. - * No need to update overruns, since this page is - * deleted from ring buffer and its entries are - * already accounted for. + * Increment overrun to account for the lost events. */ + local_add(page_entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); } @@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; + /* Don't bother swapping if the ring buffer is empty */ + if (rb_num_of_entries(cpu_buffer) == 0) + goto out; + /* * Reset the reader page to size zero. 
*/ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a7fa0702be1c..a120f98c4112 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -830,6 +830,8 @@ int register_tracer(struct tracer *type) current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ @@ -1708,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); + iter->idx++; - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + if (buf_iter) + ring_buffer_read(buf_iter, NULL); } static struct trace_entry * @@ -1718,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; - struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); @@ -1856,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) tr->data[cpu]->skipped_entries = 0; - if (!iter->buffer_iter[cpu]) + buf_iter = trace_buffer_iter(iter, cpu); + if (!buf_iter) return; - buf_iter = iter->buffer_iter[cpu]; ring_buffer_iter_reset(buf_iter); /* @@ -2205,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) int trace_empty(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter; int cpu; /* If we are looking at one CPU buffer, only check that one */ if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { cpu = iter->cpu_file; - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if 
(!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2221,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter) } for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2381,6 +2388,11 @@ __tracing_open(struct inode *inode, struct file *file) if (!iter) return ERR_PTR(-ENOMEM); + iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + GFP_KERNEL); + if (!iter->buffer_iter) + goto release; + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. @@ -2441,6 +2453,8 @@ __tracing_open(struct inode *inode, struct file *file) fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); + kfree(iter->buffer_iter); +release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } @@ -2481,6 +2495,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); kfree(iter->trace); + kfree(iter->buffer_iter); seq_release_private(inode, file); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5aec220d2de0..55e1f7f0db12 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -317,6 +317,14 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 +static inline struct ring_buffer_iter * +trace_buffer_iter(struct trace_iterator *iter, int cpu) +{ + if (iter->buffer_iter && iter->buffer_iter[cpu]) + return iter->buffer_iter[cpu]; + return NULL; +} + int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7d2a4c653d8..ce27c8ba8d31 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -538,7 
+538,7 @@ get_return_for_leaf(struct trace_iterator *iter, next = &data->ret; } else { - ring_iter = iter->buffer_iter[iter->cpu]; + ring_iter = trace_buffer_iter(iter, iter->cpu); /* First peek to compare current entry and the next one */ if (ring_iter) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index df611a0e76c5..123b189c732c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1325,4 +1325,4 @@ __init static int init_events(void) return 0; } -device_initcall(init_events); +early_initcall(init_events); |