diff options
Diffstat (limited to 'kernel')
44 files changed, 2217 insertions, 438 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index cebb11db4d34..1f37f15117e5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy) set_freezable(); while (!kthread_should_stop()) { struct sk_buff *skb; - DECLARE_WAITQUEUE(wait, current); flush_hold_queue(); @@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy) audit_printk_skb(skb); continue; } - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kauditd_wait, &wait); - if (!skb_queue_len(&audit_skb_queue)) { - try_to_freeze(); - schedule(); - } - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kauditd_wait, &wait); + wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); } return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 90a3d017b90c..5d220234b3ca 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -86,6 +86,16 @@ static struct { #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) +static void apply_puts_pending(int max) +{ + int delta; + + if (atomic_read(&cpu_hotplug.puts_pending) >= max) { + delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); + cpu_hotplug.refcount -= delta; + } +} + void get_online_cpus(void) { might_sleep(); @@ -93,6 +103,7 @@ void get_online_cpus(void) return; cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); + apply_puts_pending(65536); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); } @@ -105,6 +116,7 @@ bool try_get_online_cpus(void) if (!mutex_trylock(&cpu_hotplug.lock)) return false; cpuhp_lock_acquire_tryread(); + apply_puts_pending(65536); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); return true; @@ -161,12 +173,7 @@ void cpu_hotplug_begin(void) cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); - if (atomic_read(&cpu_hotplug.puts_pending)) { - int delta; - - delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); - cpu_hotplug.refcount -= delta; - } + apply_puts_pending(1); if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1f107c74087b..723cfc9d0ad7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) goto out; } + /* + * We can't shrink if we won't have enough room for SCHED_DEADLINE + * tasks. + */ + ret = -EBUSY; + if (is_cpu_exclusive(cur) && + !cpuset_cpumask_can_shrink(cur->cpus_allowed, + trial->cpus_allowed)) + goto out; + ret = 0; out: rcu_read_unlock(); @@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, goto out_unlock; cgroup_taskset_for_each(task, tset) { - /* - * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu - * affinity and isolating such threads by their set of - * allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for - * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. - */ - ret = -EINVAL; - if (task->flags & PF_NO_SETAFFINITY) + ret = task_can_attach(task, cs->cpus_allowed); + if (ret) goto out_unlock; ret = security_task_setscheduler(task); if (ret) diff --git a/kernel/events/core.c b/kernel/events/core.c index e56923026dd8..113b837470cd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4460,7 +4460,7 @@ perf_output_sample_regs(struct perf_output_handle *handle, } } -static void perf_sample_regs_user(struct perf_regs_user *regs_user, +static void perf_sample_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { if (!user_mode(regs)) { @@ -4471,11 +4471,22 @@ static void perf_sample_regs_user(struct perf_regs_user *regs_user, } if (regs) { - regs_user->regs = regs; regs_user->abi = perf_reg_abi(current); + regs_user->regs = regs; + } else { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; + regs_user->regs = NULL; } } +static void perf_sample_regs_intr(struct perf_regs *regs_intr, + struct pt_regs *regs) +{ + regs_intr->regs = regs; + regs_intr->abi = perf_reg_abi(current); +} + + /* * Get remaining task size from user stack pointer. * @@ -4857,6 +4868,23 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); + if (sample_type & PERF_SAMPLE_REGS_INTR) { + u64 abi = data->regs_intr.abi; + /* + * If there are no regs to dump, notice it through + * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). + */ + perf_output_put(handle, abi); + + if (abi) { + u64 mask = event->attr.sample_regs_intr; + + perf_output_sample_regs(handle, + data->regs_intr.regs, + mask); + } + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -4922,12 +4950,13 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) + perf_sample_regs_user(&data->regs_user, regs); + if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); - perf_sample_regs_user(&data->regs_user, regs); - if (data->regs_user.regs) { u64 mask = event->attr.sample_regs_user; size += hweight64(mask) * sizeof(u64); @@ -4943,15 +4972,11 @@ void perf_prepare_sample(struct perf_event_header *header, * in case new sample type is added, because we could eat * up the rest of the sample size. */ - struct perf_regs_user *uregs = &data->regs_user; u16 stack_size = event->attr.sample_stack_user; u16 size = sizeof(u64); - if (!uregs->abi) - perf_sample_regs_user(uregs, regs); - stack_size = perf_sample_ustack_size(stack_size, header->size, - uregs->regs); + data->regs_user.regs); /* * If there is something to dump, add space for the dump @@ -4964,6 +4989,21 @@ void perf_prepare_sample(struct perf_event_header *header, data->stack_user_size = stack_size; header->size += size; } + + if (sample_type & PERF_SAMPLE_REGS_INTR) { + /* regs dump ABI info */ + int size = sizeof(u64); + + perf_sample_regs_intr(&data->regs_intr, regs); + + if (data->regs_intr.regs) { + u64 mask = event->attr.sample_regs_intr; + + size += hweight64(mask) * sizeof(u64); + } + + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -7151,6 +7191,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, ret = -EINVAL; } + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) + ret = perf_reg_validate(attr->sample_regs_intr); out: return ret; diff --git a/kernel/exit.c b/kernel/exit.c index 5d30019ff953..232c4bc8bcc9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -997,6 +997,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); + if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; status = exit_code >> 8; @@ -1079,6 +1081,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); + sched_annotate_sleep(); retval = wo->wo_rusage ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; @@ -1210,6 +1213,7 @@ unlock_sig: pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); @@ -1272,6 +1276,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (!wo->wo_info) { retval = wo->wo_rusage diff --git a/kernel/fork.c b/kernel/fork.c index 9b7d746d6d62..9ca84189cfc2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1022,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) { if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); + /* + * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * without an RCU grace period, see __lock_task_sighand(). + */ kmem_cache_free(sighand_cachep, sighand); } } - /* * Initialize POSIX timer handling for a thread group. */ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 225086b2652e..9a76e3beda54 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -55,6 +55,21 @@ config GENERIC_IRQ_CHIP config IRQ_DOMAIN bool +# Support for hierarchical irq domains +config IRQ_DOMAIN_HIERARCHY + bool + select IRQ_DOMAIN + +# Generic MSI interrupt support +config GENERIC_MSI_IRQ + bool + +# Generic MSI hierarchical interrupt domain support +config GENERIC_MSI_IRQ_DOMAIN + bool + select IRQ_DOMAIN_HIERARCHY + select GENERIC_MSI_IRQ + config HANDLE_DOMAIN_IRQ bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index fff17381f0af..d12123526e2b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_PM_SLEEP) += pm.o +obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e5202f00cabc..6f1c7a566b95 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/irqdomain.h> #include <trace/events/irq.h> @@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend) irq_state_clr_disabled(desc); desc->depth = 0; + irq_domain_activate_irq(&desc->irq_data); if (desc->irq_data.chip->irq_startup) { ret = desc->irq_data.chip->irq_startup(&desc->irq_data); irq_state_clr_masked(desc); @@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_domain_deactivate_irq(&desc->irq_data); irq_state_set_masked(desc); } @@ -728,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (!handle) { handle = handle_bad_irq; } else { - if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + struct irq_data *irq_data = &desc->irq_data; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + /* + * With hierarchical domains we might run into a + * situation where the outermost chip is not yet set + * up, but the inner chips are there. Instead of + * bailing we install the handler, but obviously we + * cannot enable/startup the interrupt at this point. + */ + while (irq_data) { + if (irq_data->chip != &no_irq_chip) + break; + /* + * Bail out if the outer chip is not set up + * and the interrrupt supposed to be started + * right away. + */ + if (WARN_ON(is_chained)) + goto out; + /* Try the parent */ + irq_data = irq_data->parent_data; + } +#endif + if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip)) goto out; } @@ -847,3 +873,105 @@ void irq_cpu_offline(void) raw_spin_unlock_irqrestore(&desc->lock, flags); } } + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +/** + * irq_chip_ack_parent - Acknowledge the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_ack_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_ack(data); +} + +/** + * irq_chip_mask_parent - Mask the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_mask_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_mask(data); +} + +/** + * irq_chip_unmask_parent - Unmask the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_unmask_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_unmask(data); +} + +/** + * irq_chip_eoi_parent - Invoke EOI on the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_eoi_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_eoi(data); +} + +/** + * irq_chip_set_affinity_parent - Set affinity on the parent interrupt + * @data: Pointer to interrupt specific data + * @dest: The affinity mask to set + * @force: Flag to enforce setting (disable online checks) + * + * Conditinal, as the underlying parent chip might not implement it. + */ +int irq_chip_set_affinity_parent(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + data = data->parent_data; + if (data->chip->irq_set_affinity) + return data->chip->irq_set_affinity(data, dest, force); + + return -ENOSYS; +} + +/** + * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware + * @data: Pointer to interrupt specific data + * + * Iterate through the domain hierarchy of the interrupt and check + * whether a hw retrigger function exists. If yes, invoke it. + */ +int irq_chip_retrigger_hierarchy(struct irq_data *data) +{ + for (data = data->parent_data; data; data = data->parent_data) + if (data->chip && data->chip->irq_retrigger) + return data->chip->irq_retrigger(data); + + return -ENOSYS; +} +#endif + +/** + * irq_chip_compose_msi_msg - Componse msi message for a irq chip + * @data: Pointer to interrupt specific data + * @msg: Pointer to the MSI message + * + * For hierarchical domains we find the first chip in the hierarchy + * which implements the irq_compose_msi_msg callback. For non + * hierarchical we use the top level chip. + */ +int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct irq_data *pos = NULL; + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + for (; data; data = data->parent_data) +#endif + if (data->chip && data->chip->irq_compose_msi_msg) + pos = data; + if (!pos) + return -ENOSYS; + + pos->chip->irq_compose_msi_msg(pos, msg); + + return 0; +} diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index cf80e7b0ddab..61024e8abdef 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.disable); + irq_reg_writel(gc, mask, ct->regs.disable); *ct->mask_cache &= ~mask; irq_gc_unlock(gc); } @@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) irq_gc_lock(gc); *ct->mask_cache |= mask; - irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); + irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); @@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d) irq_gc_lock(gc); *ct->mask_cache &= ~mask; - irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); + irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); @@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.enable); + irq_reg_writel(gc, mask, ct->regs.enable); *ct->mask_cache |= mask; irq_gc_unlock(gc); } @@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); @@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d) u32 mask = ~d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } @@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.mask); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.mask); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } @@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); + irq_reg_writel(gc, mask, ct->regs.eoi); irq_gc_unlock(gc); } @@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) return 0; } +static u32 irq_readl_be(void __iomem *addr) +{ + return ioread32be(addr); +} + +static void irq_writel_be(u32 val, void __iomem *addr) +{ + iowrite32be(val, addr); +} + static void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, @@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } ct[i].mask_cache = mskptr; if (flags & IRQ_GC_INIT_MASK_CACHE) - *mskptr = irq_reg_readl(gc->reg_base + mskreg); + *mskptr = irq_reg_readl(gc, mskreg); } } @@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, dgc->gc[i] = gc = tmp; irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, NULL, handler); + gc->domain = d; + if (gcflags & IRQ_GC_BE_IO) { + gc->reg_readl = &irq_readl_be; + gc->reg_writel = &irq_writel_be; + } + raw_spin_lock_irqsave(&gc_lock, flags); list_add_tail(&gc->list, &gc_list); raw_spin_unlock_irqrestore(&gc_lock, flags); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6534ff6ce02e..7fac311057b8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; +static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node); +static void irq_domain_check_hierarchy(struct irq_domain *domain); + /** * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller @@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain; * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no * direct mapping - * @ops: map/unmap domain callbacks + * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates and initialize and irq_domain structure. @@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; + irq_domain_check_hierarchy(domain); mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); @@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove); * @first_irq: first number of irq block assigned to the domain, * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then * pre-map all of the irqs in the domain to virqs starting at first_irq. - * @ops: map/unmap domain callbacks + * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates an irq_domain, and optionally if first_irq is positive then also @@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, domain = __irq_domain_add(of_node, first_hwirq + size, first_hwirq + size, 0, ops, host_data); - if (!domain) - return NULL; - - irq_domain_associate_many(domain, first_irq, first_hwirq, size); + if (domain) + irq_domain_associate_many(domain, first_irq, first_hwirq, size); return domain; } @@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int hint; int virq; pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); @@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - hint = hwirq % nr_irqs; - if (hint == 0) - hint++; - virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node)); - if (virq <= 0) - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, + of_node_to_nid(domain->of_node)); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; - unsigned int virq; + int virq; domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; if (!domain) { @@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) return 0; } - /* Create mapping */ - virq = irq_create_mapping(domain, hwirq); - if (!virq) - return virq; + if (irq_domain_is_hierarchy(domain)) { + /* + * If we've already configured this interrupt, + * don't do it again, or hell will break loose. + */ + virq = irq_find_mapping(domain, hwirq); + if (virq) + return virq; + + virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); + if (virq <= 0) + return 0; + } else { + /* Create mapping */ + virq = irq_create_mapping(domain, hwirq); + if (!virq) + return virq; + } /* Set type if specified and different than the current one */ if (type != IRQ_TYPE_NONE && @@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain, return 0; if (hwirq < domain->revmap_direct_max_irq) { - data = irq_get_irq_data(hwirq); - if (data && (data->domain == domain) && (data->hwirq == hwirq)) + data = irq_domain_get_irq_data(domain, hwirq); + if (data && data->hwirq == hwirq) return hwirq; } @@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = { .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +static int irq_domain_alloc_descs(int virq, unsigned int cnt, + irq_hw_number_t hwirq, int node) +{ + unsigned int hint; + + if (virq >= 0) { + virq = irq_alloc_descs(virq, virq, cnt, node); + } else { + hint = hwirq % nr_irqs; + if (hint == 0) + hint++; + virq = irq_alloc_descs_from(hint, cnt, node); + if (virq <= 0 && hint > 1) + virq = irq_alloc_descs_from(1, cnt, node); + } + + return virq; +} + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +/** + * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy + * @parent: Parent irq domain to associate with the new domain + * @flags: Irq domain flags associated to the domain + * @size: Size of the domain. See below + * @node: Optional device-tree node of the interrupt controller + * @ops: Pointer to the interrupt domain callbacks + * @host_data: Controller private data pointer + * + * If @size is 0 a tree domain is created, otherwise a linear domain. + * + * If successful the parent is associated to the new domain and the + * domain flags are set. + * Returns pointer to IRQ domain, or NULL on failure. + */ +struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, + unsigned int flags, + unsigned int size, + struct device_node *node, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + + if (size) + domain = irq_domain_add_linear(node, size, ops, host_data); + else + domain = irq_domain_add_tree(node, ops, host_data); + if (domain) { + domain->parent = parent; + domain->flags |= flags; + } + + return domain; +} + +static void irq_domain_insert_irq(int virq) +{ + struct irq_data *data; + + for (data = irq_get_irq_data(virq); data; data = data->parent_data) { + struct irq_domain *domain = data->domain; + irq_hw_number_t hwirq = data->hwirq; + + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = virq; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_tree, hwirq, data); + mutex_unlock(&revmap_trees_mutex); + } + + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && data->chip) + domain->name = data->chip->name; + } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); +} + +static void irq_domain_remove_irq(int virq) +{ + struct irq_data *data; + + irq_set_status_flags(virq, IRQ_NOREQUEST); + irq_set_chip_and_handler(virq, NULL, NULL); + synchronize_irq(virq); + smp_mb(); + + for (data = irq_get_irq_data(virq); data; data = data->parent_data) { + struct irq_domain *domain = data->domain; + irq_hw_number_t hwirq = data->hwirq; + + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = 0; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&domain->revmap_tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + } + } +} + +static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, + struct irq_data *child) +{ + struct irq_data *irq_data; + + irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node); + if (irq_data) { + child->parent_data = irq_data; + irq_data->irq = child->irq; + irq_data->node = child->node; + irq_data->domain = domain; + } + + return irq_data; +} + +static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data, *tmp; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_get_irq_data(virq + i); + tmp = irq_data->parent_data; + irq_data->parent_data = NULL; + irq_data->domain = NULL; + + while (tmp) { + irq_data = tmp; + tmp = tmp->parent_data; + kfree(irq_data); + } + } +} + +static int irq_domain_alloc_irq_data(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + struct irq_domain *parent; + int i; + + /* The outermost irq_data is embedded in struct irq_desc */ + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_get_irq_data(virq + i); + irq_data->domain = domain; + + for (parent = domain->parent; parent; parent = parent->parent) { + irq_data = irq_domain_insert_irq_data(parent, irq_data); + if (!irq_data) { + irq_domain_free_irq_data(virq, i + 1); + return -ENOMEM; + } + } + } + + return 0; +} + +/** + * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain + * @domain: domain to match + * @virq: IRQ number to get irq_data + */ +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irq_data; + + for (irq_data = irq_get_irq_data(virq); irq_data; + irq_data = irq_data->parent_data) + if (irq_data->domain == domain) + return irq_data; + + return NULL; +} + +/** + * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain + * @domain: Interrupt domain to match + * @virq: IRQ number + * @hwirq: The hwirq number + * @chip: The associated interrupt chip + * @chip_data: The associated chip data + */ +int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data) +{ + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + if (!irq_data) + return -ENOENT; + + irq_data->hwirq = hwirq; + irq_data->chip = chip ? chip : &no_irq_chip; + irq_data->chip_data = chip_data; + + return 0; +} + +/** + * irq_domain_set_info - Set the complete data for a @virq in @domain + * @domain: Interrupt domain to match + * @virq: IRQ number + * @hwirq: The hardware interrupt number + * @chip: The associated interrupt chip + * @chip_data: The associated interrupt chip data + * @handler: The interrupt flow handler + * @handler_data: The interrupt flow handler data + * @handler_name: The interrupt handler name + */ +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name) +{ + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data); + __irq_set_handler(virq, handler, 0, handler_name); + irq_set_handler_data(virq, handler_data); +} + +/** + * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data + * @irq_data: The pointer to irq_data + */ +void irq_domain_reset_irq_data(struct irq_data *irq_data) +{ + irq_data->hwirq = 0; + irq_data->chip = &no_irq_chip; + irq_data->chip_data = NULL; +} + +/** + * irq_domain_free_irqs_common - Clear irq_data and free the parent + * @domain: Interrupt domain to match + * @virq: IRQ number to start with + * @nr_irqs: The number of irqs to free + */ +void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irq_data; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + if (irq_data) + irq_domain_reset_irq_data(irq_data); + } + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +} + +/** + * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent + * @domain: Interrupt domain to match + * @virq: IRQ number to start with + * @nr_irqs: The number of irqs to free + */ +void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_set_handler_data(virq + i, NULL); + irq_set_handler(virq + i, NULL); + } + irq_domain_free_irqs_common(domain, virq, nr_irqs); +} + +static bool irq_domain_is_auto_recursive(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE; +} + +static void irq_domain_free_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs) +{ + domain->ops->free(domain, irq_base, nr_irqs); + if (irq_domain_is_auto_recursive(domain)) { + BUG_ON(!domain->parent); + irq_domain_free_irqs_recursive(domain->parent, irq_base, + nr_irqs); + } +} + +static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg) +{ + int ret = 0; + struct irq_domain *parent = domain->parent; + bool recursive = irq_domain_is_auto_recursive(domain); + + BUG_ON(recursive && !parent); + if (recursive) + ret = irq_domain_alloc_irqs_recursive(parent, irq_base, + nr_irqs, arg); + if (ret >= 0) + ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); + if (ret < 0 && recursive) + irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); + + return ret; +} + +/** + * __irq_domain_alloc_irqs - Allocate IRQs from domain + * @domain: domain to allocate from + * @irq_base: allocate specified IRQ nubmer if irq_base >= 0 + * @nr_irqs: number of IRQs to allocate + * @node: NUMA node id for memory allocation + * @arg: domain specific argument + * @realloc: IRQ descriptors have already been allocated if true + * + * Allocate IRQ numbers and initialized all data structures to support + * hierarchy IRQ domains. + * Parameter @realloc is mainly to support legacy IRQs. + * Returns error code or allocated IRQ number + * + * The whole process to setup an IRQ has been split into two steps. + * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ + * descriptor and required hardware resources. The second step, + * irq_domain_activate_irq(), is to program hardwares with preallocated + * resources. In this way, it's easier to rollback when failing to + * allocate resources. + */ +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc) +{ + int i, ret, virq; + + if (domain == NULL) { + domain = irq_default_domain; + if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) + return -EINVAL; + } + + if (!domain->ops->alloc) { + pr_debug("domain->ops->alloc() is NULL\n"); + return -ENOSYS; + } + + if (realloc && irq_base >= 0) { + virq = irq_base; + } else { + virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); + if (virq < 0) { + pr_debug("cannot allocate IRQ(base %d, count %d)\n", + irq_base, nr_irqs); + return virq; + } + } + + if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) { + pr_debug("cannot allocate memory for IRQ%d\n", virq); + ret = -ENOMEM; + goto out_free_desc; + } + + mutex_lock(&irq_domain_mutex); + ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg); + if (ret < 0) { + mutex_unlock(&irq_domain_mutex); + goto out_free_irq_data; + } + for (i = 0; i < nr_irqs; i++) + irq_domain_insert_irq(virq + i); + mutex_unlock(&irq_domain_mutex); + + return virq; + +out_free_irq_data: + irq_domain_free_irq_data(virq, nr_irqs); +out_free_desc: + irq_free_descs(virq, nr_irqs); + return ret; +} + +/** + * irq_domain_free_irqs - Free IRQ number and associated data structures + * @virq: base IRQ number + * @nr_irqs: number of IRQs to free + */ +void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *data = irq_get_irq_data(virq); + int i; + + if (WARN(!data || !data->domain || !data->domain->ops->free, + "NULL pointer, cannot free irq\n")) + return; + + mutex_lock(&irq_domain_mutex); + for (i = 0; i < nr_irqs; i++) + irq_domain_remove_irq(virq + i); + irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs); + mutex_unlock(&irq_domain_mutex); + + irq_domain_free_irq_data(virq, nr_irqs); + irq_free_descs(virq, nr_irqs); +} + +/** + * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain + * @irq_base: Base IRQ number + * @nr_irqs: Number of IRQs to allocate + * @arg: Allocation data (arch/domain specific) + * + * Check whether the domain has been setup recursive. If not allocate + * through the parent domain. + */ +int irq_domain_alloc_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, unsigned int nr_irqs, + void *arg) +{ + /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */ + if (irq_domain_is_auto_recursive(domain)) + return 0; + + domain = domain->parent; + if (domain) + return irq_domain_alloc_irqs_recursive(domain, irq_base, + nr_irqs, arg); + return -ENOSYS; +} + +/** + * irq_domain_free_irqs_parent - Free interrupts from parent domain + * @irq_base: Base IRQ number + * @nr_irqs: Number of IRQs to free + * + * Check whether the domain has been setup recursive. If not free + * through the parent domain. + */ +void irq_domain_free_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, unsigned int nr_irqs) +{ + /* irq_domain_free_irqs_recursive() will call parent's free */ + if (!irq_domain_is_auto_recursive(domain) && domain->parent) + irq_domain_free_irqs_recursive(domain->parent, irq_base, + nr_irqs); +} + +/** + * irq_domain_activate_irq - Call domain_ops->activate recursively to activate + * interrupt + * @irq_data: outermost irq_data associated with interrupt + * + * This is the second step to call domain_ops->activate to program interrupt + * controllers, so the interrupt could actually get delivered. + */ +void irq_domain_activate_irq(struct irq_data *irq_data) +{ + if (irq_data && irq_data->domain) { + struct irq_domain *domain = irq_data->domain; + + if (irq_data->parent_data) + irq_domain_activate_irq(irq_data->parent_data); + if (domain->ops->activate) + domain->ops->activate(domain, irq_data); + } +} + +/** + * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to + * deactivate interrupt + * @irq_data: outermost irq_data associated with interrupt + * + * It calls domain_ops->deactivate to program interrupt controllers to disable + * interrupt delivery. + */ +void irq_domain_deactivate_irq(struct irq_data *irq_data) +{ + if (irq_data && irq_data->domain) { + struct irq_domain *domain = irq_data->domain; + + if (domain->ops->deactivate) + domain->ops->deactivate(domain, irq_data); + if (irq_data->parent_data) + irq_domain_deactivate_irq(irq_data->parent_data); + } +} + +static void irq_domain_check_hierarchy(struct irq_domain *domain) +{ + /* Hierarchy irq_domains must implement callback alloc() */ + if (domain->ops->alloc) + domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; +} +#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ +/** + * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain + * @domain: domain to match + * @virq: IRQ number to get irq_data + */ +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + return (irq_data && irq_data->domain == domain) ? irq_data : NULL; +} + +static void irq_domain_check_hierarchy(struct irq_domain *domain) +{ +} +#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a9104b4608b..80692373abd6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -183,6 +183,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, ret = chip->irq_set_affinity(data, mask, force); switch (ret) { case IRQ_SET_MASK_OK: + case IRQ_SET_MASK_OK_DONE: cpumask_copy(data->affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); @@ -600,6 +601,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, switch (ret) { case IRQ_SET_MASK_OK: + case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c new file mode 100644 index 000000000000..3e18163f336f --- /dev/null +++ b/kernel/irq/msi.c @@ -0,0 +1,330 @@ +/* + * linux/kernel/irq/msi.c + * + * Copyright (C) 2014 Intel Corp. + * Author: Jiang Liu <jiang.liu@linux.intel.com> + * + * This file is licensed under GPLv2. + * + * This file contains common code to support Message Signalled Interrupt for + * PCI compatible and non PCI compatible devices. + */ +#include <linux/types.h> +#include <linux/device.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/msi.h> + +/* Temparory solution for building, will be removed later */ +#include <linux/pci.h> + +void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +{ + *msg = entry->msg; +} + +void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_get_msi_desc(irq); + + __get_cached_msi_msg(entry, msg); +} +EXPORT_SYMBOL_GPL(get_cached_msi_msg); + +#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +static inline void irq_chip_write_msi_msg(struct irq_data *data, + struct msi_msg *msg) +{ + data->chip->irq_write_msi_msg(data, msg); +} + +/** + * msi_domain_set_affinity - Generic affinity setter function for MSI domains + * @irq_data: The irq data associated to the interrupt + * @mask: The affinity mask to set + * @force: Flag to enforce setting (disable online checks) + * + * Intended to be used by MSI interrupt controllers which are + * implemented with hierarchical domains. + */ +int msi_domain_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + struct msi_msg msg; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + irq_chip_write_msi_msg(irq_data, &msg); + } + + return ret; +} + +static void msi_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + irq_chip_write_msi_msg(irq_data, &msg); +} + +static void msi_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + memset(&msg, 0, sizeof(msg)); + irq_chip_write_msi_msg(irq_data, &msg); +} + +static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + irq_hw_number_t hwirq = ops->get_hwirq(info, arg); + int i, ret; + + if (irq_find_mapping(domain, hwirq) > 0) + return -EEXIST; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + + for (i = 0; i < nr_irqs; i++) { + ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); + if (ret < 0) { + if (ops->msi_free) { + for (i--; i > 0; i--) + ops->msi_free(domain, info, virq + i); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); + return ret; + } + } + + return 0; +} + +static void msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct msi_domain_info *info = domain->host_data; + int i; + + if (info->ops->msi_free) { + for (i = 0; i < nr_irqs; i++) + info->ops->msi_free(domain, info, virq + i); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static struct irq_domain_ops msi_domain_ops = { + .alloc = msi_domain_alloc, + .free = msi_domain_free, + .activate = msi_domain_activate, + .deactivate = msi_domain_deactivate, +}; + +#ifdef GENERIC_MSI_DOMAIN_OPS +static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->hwirq; +} + +static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + memset(arg, 0, sizeof(*arg)); + return 0; +} + +static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, + struct msi_desc *desc) +{ + arg->desc = desc; +} +#else +#define msi_domain_ops_get_hwirq NULL +#define msi_domain_ops_prepare NULL +#define msi_domain_ops_set_desc NULL +#endif /* !GENERIC_MSI_DOMAIN_OPS */ + +static int msi_domain_ops_init(struct irq_domain *domain, + struct msi_domain_info *info, + unsigned int virq, irq_hw_number_t hwirq, + msi_alloc_info_t *arg) +{ + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip, + info->chip_data); + if (info->handler && info->handler_name) { + __irq_set_handler(virq, info->handler, 0, info->handler_name); + if (info->handler_data) + irq_set_handler_data(virq, info->handler_data); + } + return 0; +} + +static int msi_domain_ops_check(struct irq_domain *domain, + struct msi_domain_info *info, + struct device *dev) +{ + return 0; +} + +static struct msi_domain_ops msi_domain_ops_default = { + .get_hwirq = msi_domain_ops_get_hwirq, + .msi_init = msi_domain_ops_init, + .msi_check = msi_domain_ops_check, + .msi_prepare = msi_domain_ops_prepare, + .set_desc = msi_domain_ops_set_desc, +}; + +static void msi_domain_update_dom_ops(struct msi_domain_info *info) +{ + struct msi_domain_ops *ops = info->ops; + + if (ops == NULL) { + info->ops = &msi_domain_ops_default; + return; + } + + if (ops->get_hwirq == NULL) + ops->get_hwirq = msi_domain_ops_default.get_hwirq; + if (ops->msi_init == NULL) + ops->msi_init = msi_domain_ops_default.msi_init; + if (ops->msi_check == NULL) + ops->msi_check = msi_domain_ops_default.msi_check; + if (ops->msi_prepare == NULL) + ops->msi_prepare = msi_domain_ops_default.msi_prepare; + if (ops->set_desc == NULL) + ops->set_desc = msi_domain_ops_default.set_desc; +} + +static void msi_domain_update_chip_ops(struct msi_domain_info *info) +{ + struct irq_chip *chip = info->chip; + + BUG_ON(!chip); + if (!chip->irq_mask) + chip->irq_mask = pci_msi_mask_irq; + if (!chip->irq_unmask) + chip->irq_unmask = pci_msi_unmask_irq; + if (!chip->irq_set_affinity) + chip->irq_set_affinity = msi_domain_set_affinity; +} + +/** + * msi_create_irq_domain - Create a MSI interrupt domain + * @of_node: Optional device-tree node of the interrupt controller + * @info: MSI domain info + * @parent: Parent irq domain + */ +struct irq_domain *msi_create_irq_domain(struct device_node *node, + struct msi_domain_info *info, + struct irq_domain *parent) +{ + if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) + msi_domain_update_dom_ops(info); + if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) + msi_domain_update_chip_ops(info); + + return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, + info); +} + +/** + * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @nvec: The number of interrupts to allocate + * + * Returns 0 on success or an error code. + */ +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + msi_alloc_info_t arg; + struct msi_desc *desc; + int i, ret, virq = -1; + + ret = ops->msi_check(domain, info, dev); + if (ret == 0) + ret = ops->msi_prepare(domain, dev, nvec, &arg); + if (ret) + return ret; + + for_each_msi_entry(desc, dev) { + ops->set_desc(&arg, desc); + if (info->flags & MSI_FLAG_IDENTITY_MAP) + virq = (int)ops->get_hwirq(info, &arg); + else + virq = -1; + + virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, + dev_to_node(dev), &arg, false); + if (virq < 0) { + ret = -ENOSPC; + if (ops->handle_error) + ret = ops->handle_error(domain, desc, ret); + if (ops->msi_finish) + ops->msi_finish(&arg, ret); + return ret; + } + + for (i = 0; i < desc->nvec_used; i++) + irq_set_msi_desc_off(virq, i, desc); + } + + if (ops->msi_finish) + ops->msi_finish(&arg, 0); + + for_each_msi_entry(desc, dev) { + if (desc->nvec_used == 1) + dev_dbg(dev, "irq %d for MSI\n", virq); + else + dev_dbg(dev, "irq [%d-%d] for MSI\n", + virq, virq + desc->nvec_used - 1); + } + + return 0; +} + +/** + * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct of the device for which the interrupts + * are free + */ +void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + struct msi_desc *desc; + + for_each_msi_entry(desc, dev) { + irq_domain_free_irqs(desc->irq, desc->nvec_used); + desc->irq = 0; + } +} + +/** + * msi_get_domain_info - Get the MSI interrupt domain info for @domain + * @domain: The interrupt domain to retrieve data from + * + * Returns the pointer to the msi_domain_info stored in + * @domain->host_data. + */ +struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) +{ + return (struct msi_domain_info *)domain->host_data; +} + +#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index dadbf88c22c4..454195194d4a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -378,8 +378,14 @@ done: * reschedule now, before we try-lock the mutex. This avoids getting * scheduled out right after we obtained the mutex. */ - if (need_resched()) + if (need_resched()) { + /* + * We _should_ have TASK_RUNNING here, but just in case + * we do not, make it so, otherwise we might get stuck. + */ + __set_current_state(TASK_RUNNING); schedule_preempt_disabled(); + } return false; } diff --git a/kernel/module.c b/kernel/module.c index 88cec1ddb1e3..e52a8739361a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3097,6 +3097,32 @@ static int may_init_module(void) } /* + * Can't use wait_event_interruptible() because our condition + * 'finished_loading()' contains a blocking primitive itself (mutex_lock). + */ +static int wait_finished_loading(struct module *mod) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + add_wait_queue(&module_wq, &wait); + for (;;) { + if (finished_loading(mod->name)) + break; + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + remove_wait_queue(&module_wq, &wait); + + return ret; +} + +/* * We try to place it in the list now to make sure it's unique before * we dedicate too many resources. In particular, temporary percpu * memory exhaustion. @@ -3116,8 +3142,8 @@ again: || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); + + err = wait_finished_loading(mod); if (err) goto out_unlocked; goto again; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ced2b84b1cb7..c8755e7e1dba 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -480,7 +480,7 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -static int check_syslog_permissions(int type, bool from_file) +int check_syslog_permissions(int type, bool from_file) { /* * If this is from /proc/kmsg and we've already opened it, then we've diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 807ccfbf69b3..e6fae503d1bc 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,6 +1,6 @@ obj-y += update.o srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o -obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o +obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ff1a6de62f17..07bb02eda844 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void); */ #define TPS(x) tracepoint_string(x) +void rcu_early_boot_tests(void); + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 240fa9094f83..4d559baf06e0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -812,6 +812,7 @@ rcu_torture_cbflood(void *arg) cur_ops->cb_barrier(); stutter_wait("rcu_torture_cbflood"); } while (!torture_must_stop()); + vfree(rhp); torture_kthread_stopping("rcu_torture_cbflood"); return 0; } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c0623fc47125..0db5649f8817 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -247,7 +247,7 @@ void rcu_bh_qs(void) * be called from hardirq context. It is normally called from the * scheduling-clock interrupt. */ -void rcu_check_callbacks(int cpu, int user) +void rcu_check_callbacks(int user) { RCU_TRACE(check_cpu_stalls()); if (user || rcu_is_cpu_rrupt_from_idle()) @@ -380,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); -void rcu_init(void) +void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + + rcu_early_boot_tests(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9815447d22e0..7680fc275036 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -105,7 +105,7 @@ struct rcu_state sname##_state = { \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ }; \ -DEFINE_PER_CPU(struct rcu_data, sname##_data) +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); @@ -152,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; -#ifdef CONFIG_RCU_BOOST - -/* - * Control variables for per-CPU and per-rcu_node kthreads. These - * handle all flavors of RCU. - */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - -#endif /* #ifdef CONFIG_RCU_BOOST */ - static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -286,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void) * and requires special handling for preemptible RCU. * The caller must have disabled preemption. */ -void rcu_note_context_switch(int cpu) +void rcu_note_context_switch(void) { trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); - rcu_preempt_note_context_switch(cpu); + rcu_preempt_note_context_switch(); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); @@ -325,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj); static void force_quiescent_state(struct rcu_state *rsp); -static int rcu_pending(int cpu); +static int rcu_pending(void); /* * Return the number of RCU-sched batches processed thus far for debug & stats. @@ -510,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) * we really have entered idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, - bool user) +static void rcu_eqs_enter_common(long long oldval, bool user) { struct rcu_state *rsp; struct rcu_data *rdp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { @@ -531,7 +518,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, rdp = this_cpu_ptr(rsp->rda); do_nocb_deferred_wakeup(rdp); } - rcu_prepare_for_idle(smp_processor_id()); + rcu_prepare_for_idle(); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); @@ -565,7 +552,7 @@ static void rcu_eqs_enter(bool user) WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { rdtp->dynticks_nesting = 0; - rcu_eqs_enter_common(rdtp, oldval, user); + rcu_eqs_enter_common(oldval, user); } else { rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; } @@ -589,7 +576,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); + rcu_sysidle_enter(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -639,8 +626,8 @@ void rcu_irq_exit(void) if (rdtp->dynticks_nesting) trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else - rcu_eqs_enter_common(rdtp, oldval, true); - rcu_sysidle_enter(rdtp, 1); + rcu_eqs_enter_common(oldval, true); + rcu_sysidle_enter(1); local_irq_restore(flags); } @@ -651,16 +638,17 @@ void rcu_irq_exit(void) * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, - int user) +static void rcu_eqs_exit_common(long long oldval, int user) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + rcu_dynticks_task_exit(); smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - rcu_cleanup_after_idle(smp_processor_id()); + rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = @@ -691,7 +679,7 @@ static void rcu_eqs_exit(bool user) rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(rdtp, oldval, user); + rcu_eqs_exit_common(oldval, user); } } @@ -712,7 +700,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); + rcu_sysidle_exit(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -763,8 +751,8 @@ void rcu_irq_enter(void) if (oldval) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else - rcu_eqs_exit_common(rdtp, oldval, true); - rcu_sysidle_exit(rdtp, 1); + rcu_eqs_exit_common(oldval, true); + rcu_sysidle_exit(1); local_irq_restore(flags); } @@ -2387,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * invoked from the scheduling-clock interrupt. If rcu_pending returns * false, there is no point in invoking rcu_check_callbacks(). */ -void rcu_check_callbacks(int cpu, int user) +void rcu_check_callbacks(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); increment_cpu_stall_ticks(); @@ -2419,8 +2407,8 @@ void rcu_check_callbacks(int cpu, int user) rcu_bh_qs(); } - rcu_preempt_check_callbacks(cpu); - if (rcu_pending(cpu)) + rcu_preempt_check_callbacks(); + if (rcu_pending()) invoke_rcu_core(); if (user) rcu_note_voluntary_context_switch(current); @@ -2963,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data) */ void synchronize_sched_expedited(void) { + cpumask_var_t cm; + bool cma = false; + int cpu; long firstsnap, s, snap; int trycount = 0; struct rcu_state *rsp = &rcu_sched_state; @@ -2997,11 +2988,26 @@ void synchronize_sched_expedited(void) } WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); + /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ + cma = zalloc_cpumask_var(&cm, GFP_KERNEL); + if (cma) { + cpumask_copy(cm, cpu_online_mask); + cpumask_clear_cpu(raw_smp_processor_id(), cm); + for_each_cpu(cpu, cm) { + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + cpumask_clear_cpu(cpu, cm); + } + if (cpumask_weight(cm) == 0) + goto all_cpus_idle; + } + /* * Each pass through the following loop attempts to force a * context switch on each CPU. */ - while (try_stop_cpus(cpu_online_mask, + while (try_stop_cpus(cma ? cm : cpu_online_mask, synchronize_sched_expedited_cpu_stop, NULL) == -EAGAIN) { put_online_cpus(); @@ -3013,6 +3019,7 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); + free_cpumask_var(cm); return; } @@ -3022,6 +3029,7 @@ void synchronize_sched_expedited(void) } else { wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_normal); + free_cpumask_var(cm); return; } @@ -3031,6 +3039,7 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); + free_cpumask_var(cm); return; } @@ -3045,6 +3054,7 @@ void synchronize_sched_expedited(void) /* CPU hotplug operation in flight, use normal GP. */ wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_normal); + free_cpumask_var(cm); return; } snap = atomic_long_read(&rsp->expedited_start); @@ -3052,6 +3062,9 @@ void synchronize_sched_expedited(void) } atomic_long_inc(&rsp->expedited_stoppedcpus); +all_cpus_idle: + free_cpumask_var(cm); + /* * Everyone up to our most recent fetch is covered by our grace * period. Update the counter, but only if our work is still @@ -3143,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) * by the current CPU, returning 1 if so. This function is part of the * RCU implementation; it is -not- an exported member of the RCU API. */ -static int rcu_pending(int cpu) +static int rcu_pending(void) { struct rcu_state *rsp; for_each_rcu_flavor(rsp) - if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) return 1; return 0; } @@ -3158,7 +3171,7 @@ static int rcu_pending(int cpu) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) +static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; @@ -3166,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) struct rcu_state *rsp; for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = this_cpu_ptr(rsp->rda); if (!rdp->nxtlist) continue; hc = true; @@ -3485,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self, case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - for_each_rcu_flavor(rsp) + for_each_rcu_flavor(rsp) { rcu_cleanup_dead_cpu(cpu, rsp); + do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); + } break; default: break; @@ -3766,6 +3781,8 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + + rcu_early_boot_tests(); } #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bbdc45d8d74f..8e7b1843896e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -139,7 +139,7 @@ struct rcu_node { unsigned long expmask; /* Groups that have ->blkd_tasks */ /* elements that need to drain to allow the */ /* current expedited grace period to */ - /* complete (only for TREE_PREEMPT_RCU). */ + /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ @@ -530,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); extern struct rcu_state rcu_bh_state; DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -547,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu); +static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, @@ -561,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); -#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ +#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); @@ -579,8 +579,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, #endif /* #ifdef CONFIG_RCU_BOOST */ static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); -static void rcu_cleanup_after_idle(int cpu); -static void rcu_prepare_for_idle(int cpu); +static void rcu_cleanup_after_idle(void); +static void rcu_prepare_for_idle(void); static void rcu_idle_count_callbacks_posted(void); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); @@ -606,8 +606,8 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_enter(int irq); +static void rcu_sysidle_exit(int irq); static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj); static bool is_sysidle_rcu_state(struct rcu_state *rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c1d7f27bd38f..3ec85cb5d544 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -30,14 +30,24 @@ #include <linux/smpboot.h> #include "../time/tick-internal.h" -#define RCU_KTHREAD_PRIO 1 - #ifdef CONFIG_RCU_BOOST + #include "../locking/rtmutex_common.h" -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO -#else -#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO -#endif + +/* rcuc/rcub kthread realtime priority */ +static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +module_param(kthread_prio, int, 0644); + +/* + * Control variables for per-CPU and per-rcu_node kthreads. These + * handle all flavors of RCU. + */ +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DEFINE_PER_CPU(char, rcu_cpu_has_work); + +#endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ @@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE pr_info("\tRCU torture testing starts during boot.\n"); #endif -#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); -#endif #if defined(CONFIG_RCU_CPU_STALL_INFO) pr_info("\tAdditional per-CPU info printed with stalls.\n"); #endif @@ -85,9 +92,12 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); +#ifdef CONFIG_RCU_BOOST + pr_info("\tRCU kthread priority: %d.\n", kthread_prio); +#endif } -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state_p = &rcu_preempt_state; @@ -156,7 +166,7 @@ static void rcu_preempt_qs(void) * * Caller must disable preemption. */ -static void rcu_preempt_note_context_switch(int cpu) +static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; unsigned long flags; @@ -167,7 +177,7 @@ static void rcu_preempt_note_context_switch(int cpu) !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); + rdp = this_cpu_ptr(rcu_preempt_state.rda); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); @@ -415,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t) } } -#ifdef CONFIG_RCU_CPU_STALL_VERBOSE - /* * Dump detailed information for all tasks blocking the current RCU * grace period on the specified rcu_node structure. @@ -451,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) rcu_print_detail_task_stall_rnp(rnp); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ - -static void rcu_print_detail_task_stall(struct rcu_state *rsp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ - #ifdef CONFIG_RCU_CPU_STALL_INFO static void rcu_print_task_stall_begin(struct rcu_node *rnp) @@ -621,7 +621,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * * Caller must disable hard irqs. */ -static void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(void) { struct task_struct *t = current; @@ -630,8 +630,8 @@ static void rcu_preempt_check_callbacks(int cpu) return; } if (t->rcu_read_lock_nesting > 0 && - per_cpu(rcu_preempt_data, cpu).qs_pending && - !per_cpu(rcu_preempt_data, cpu).passed_quiesce) + __this_cpu_read(rcu_preempt_data.qs_pending) && + !__this_cpu_read(rcu_preempt_data.passed_quiesce)) t->rcu_read_unlock_special.b.need_qs = true; } @@ -919,7 +919,7 @@ void exit_rcu(void) __rcu_read_unlock(); } -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *rcu_state_p = &rcu_sched_state; @@ -945,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_note_context_switch(int cpu) +static void rcu_preempt_note_context_switch(void) { } @@ -1017,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * Because preemptible RCU does not exist, it never has any callbacks * to check. */ -static void rcu_preempt_check_callbacks(int cpu) +static void rcu_preempt_check_callbacks(void) { } @@ -1070,7 +1070,7 @@ void exit_rcu(void) { } -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST @@ -1326,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, smp_mb__after_unlock_lock(); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = RCU_BOOST_PRIO; + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ return 0; @@ -1343,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) { struct sched_param sp; - sp.sched_priority = RCU_KTHREAD_PRIO; + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); } @@ -1512,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu) * any flavor of RCU. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +int rcu_needs_cpu(unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(cpu, NULL); + return rcu_cpu_has_callbacks(NULL); } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ @@ -1523,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up * after it. */ -static void rcu_cleanup_after_idle(int cpu) +static void rcu_cleanup_after_idle(void) { } @@ -1531,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu) * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, * is nothing. */ -static void rcu_prepare_for_idle(int cpu) +static void rcu_prepare_for_idle(void) { } @@ -1624,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * The caller must have disabled interrupts. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(int cpu, unsigned long *dj) +int rcu_needs_cpu(unsigned long *dj) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { + if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { *dj = ULONG_MAX; return 0; } @@ -1666,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) * * The caller must have disabled interrupts. */ -static void rcu_prepare_for_idle(int cpu) +static void rcu_prepare_for_idle(void) { #ifndef CONFIG_RCU_NOCB_CPU_ALL bool needwake; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); struct rcu_node *rnp; struct rcu_state *rsp; int tne; @@ -1679,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu) /* Handle nohz enablement switches conservatively. */ tne = ACCESS_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(cpu, NULL)) + if (rcu_cpu_has_callbacks(NULL)) invoke_rcu_core(); /* force nohz to see update. */ rdtp->tick_nohz_enabled_snap = tne; return; @@ -1688,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu) return; /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(smp_processor_id())) return; /* @@ -1712,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu) return; rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = this_cpu_ptr(rsp->rda); if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; @@ -1731,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu) * any grace periods that elapsed while the CPU was idle, and if any * callbacks are now ready to invoke, initiate invocation. */ -static void rcu_cleanup_after_idle(int cpu) +static void rcu_cleanup_after_idle(void) { #ifndef CONFIG_RCU_NOCB_CPU_ALL - if (rcu_is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -2573,9 +2573,13 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) rdp->nocb_leader = rdp_spawn; if (rdp_last && rdp != rdp_spawn) rdp_last->nocb_next_follower = rdp; - rdp_last = rdp; - rdp = rdp->nocb_next_follower; - rdp_last->nocb_next_follower = NULL; + if (rdp == rdp_spawn) { + rdp = rdp->nocb_next_follower; + } else { + rdp_last = rdp; + rdp = rdp->nocb_next_follower; + rdp_last->nocb_next_follower = NULL; + } } while (rdp); rdp_spawn->nocb_next_follower = rdp_old_leader; } @@ -2761,9 +2765,10 @@ static int full_sysidle_state; /* Current system-idle state. */ * to detect full-system idle states, not RCU quiescent states and grace * periods. The caller must have disabled interrupts. */ -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_enter(int irq) { unsigned long j; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); /* If there are no nohz_full= CPUs, no need to track this. */ if (!tick_nohz_full_enabled()) @@ -2832,8 +2837,10 @@ void rcu_sysidle_force_exit(void) * usermode execution does -not- count as idle here! The caller must * have disabled interrupts. */ -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_exit(int irq) { + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + /* If there are no nohz_full= CPUs, no need to track this. */ if (!tick_nohz_full_enabled()) return; @@ -3127,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_enter(int irq) { } -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +static void rcu_sysidle_exit(int irq) { } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3ef8ba58694e..e0d31a345ee6 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -306,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = { EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) @@ -531,7 +531,8 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); - /* FIXME: Add housekeeping affinity. */ + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current); /* * Each pass through the following loop makes one check for @@ -690,3 +691,87 @@ static void rcu_spawn_tasks_kthread(void) } #endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_PROVE_RCU + +/* + * Early boot self test parameters, one for each flavor + */ +static bool rcu_self_test; +static bool rcu_self_test_bh; +static bool rcu_self_test_sched; + +module_param(rcu_self_test, bool, 0444); +module_param(rcu_self_test_bh, bool, 0444); +module_param(rcu_self_test_sched, bool, 0444); + +static int rcu_self_test_counter; + +static void test_callback(struct rcu_head *r) +{ + rcu_self_test_counter++; + pr_info("RCU test callback executed %d\n", rcu_self_test_counter); +} + +static void early_boot_test_call_rcu(void) +{ + static struct rcu_head head; + + call_rcu(&head, test_callback); +} + +static void early_boot_test_call_rcu_bh(void) +{ + static struct rcu_head head; + + call_rcu_bh(&head, test_callback); +} + +static void early_boot_test_call_rcu_sched(void) +{ + static struct rcu_head head; + + call_rcu_sched(&head, test_callback); +} + +void rcu_early_boot_tests(void) +{ + pr_info("Running RCU self tests\n"); + + if (rcu_self_test) + early_boot_test_call_rcu(); + if (rcu_self_test_bh) + early_boot_test_call_rcu_bh(); + if (rcu_self_test_sched) + early_boot_test_call_rcu_sched(); +} + +static int rcu_verify_early_boot_tests(void) +{ + int ret = 0; + int early_boot_test_counter = 0; + + if (rcu_self_test) { + early_boot_test_counter++; + rcu_barrier(); + } + if (rcu_self_test_bh) { + early_boot_test_counter++; + rcu_barrier_bh(); + } + if (rcu_self_test_sched) { + early_boot_test_counter++; + rcu_barrier_sched(); + } + + if (rcu_self_test_counter != early_boot_test_counter) { + WARN_ON(1); + ret = -1; + } + + return ret; +} +late_initcall(rcu_verify_early_boot_tests); +#else +void rcu_early_boot_tests(void) {} +#endif /* CONFIG_PROVE_RCU */ diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a63f4dc27909..607f852b4d04 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout); * * This waits to be signaled for completion of a specific task. It is NOT * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. + * for IO (which traditionally means blkio only). */ void __sched wait_for_completion_io(struct completion *x) { @@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io); * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. The caller is accounted as waiting for IO. + * interruptible. The caller is accounted as waiting for IO (which traditionally + * means blkio only). * * Return: 0 if timed out, and positive (at least 1, or number of jiffies left * till timeout) if completed. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 24beb9bb4c3e..bb398c0c5f08 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * Can drop rq->lock because from sched_class::switched_from() methods drop it. + */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio) @@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); + /* Possble rq->lock 'hole'. */ p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); @@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_preempt_count(p) & PREEMPT_ACTIVE)); + !p->on_rq); #ifdef CONFIG_LOCKDEP /* @@ -1407,7 +1411,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + if (p->nr_cpus_allowed > 1) + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!is_idle_task(rq->curr)) - return; + rcu_read_lock(); + + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; if (set_nr_if_polling(rq->idle)) { trace_sched_wake_idle_without_ipi(cpu); @@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu) /* Else cpu is not in idle, do nothing here */ raw_spin_unlock_irqrestore(&rq->lock, flags); } + +out: + rcu_read_unlock(); } bool cpus_share_cache(int this_cpu, int that_cpu) @@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; + p->numa_faults = NULL; p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; - INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } @@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i) } #endif -static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw -= tsk_bw; -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw += tsk_bw; -} - -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; -} - /* * We must be sure that accepting a new task (or allowing changing the * parameters of an existing one) is consistent with the bandwidth @@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, /** * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. * * finish_task_switch must be called after the context switch, paired @@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) + * + * The context switch have flipped the stack from under us and restored the + * local variables which were saved when this task called schedule() in the + * past. prev == current is still correct but we need to recalculate this_rq + * because prev may have moved to another CPU. */ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static struct rq *finish_task_switch(struct task_struct *prev) __releases(rq->lock) { + struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; long prev_state; @@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) } tick_nohz_task_switch(current); + return rq; } #ifdef CONFIG_SMP @@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + struct rq *rq; - /* - * FIXME: do we need to worry about rq being invalidated by the - * task_switch? - */ + /* finish_task_switch() drops rq->lock and enables preemtion */ + preempt_disable(); + rq = finish_task_switch(prev); post_schedule(rq); + preempt_enable(); if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } /* - * context_switch - switch to the new MM and the new - * thread's register state. + * context_switch - switch to the new MM and the new thread's register state. */ -static inline void +static inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev, context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); + + return finish_task_switch(prev); } /* @@ -2773,7 +2760,7 @@ need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(cpu); + rcu_note_context_switch(); prev = rq->curr; schedule_debug(prev); @@ -2826,15 +2813,8 @@ need_resched: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ - /* - * The context switch have flipped the stack from under us - * and restored the local variables which were saved when - * this task called schedule() in the past. prev == current - * is still correct, but it can be moved to another cpu/rq. - */ - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + rq = context_switch(rq, prev, next); /* unlocks the rq */ + cpu = cpu_of(rq); } else raw_spin_unlock_irq(&rq->lock); @@ -2874,10 +2854,14 @@ asmlinkage __visible void __sched schedule_user(void) * or we have been woken up remotely but the IPI has not yet arrived, * we haven't yet exited the RCU idle mode. Do it here manually until * we find a better solution. + * + * NB: There are buggy callers of this function. Ideally we + * should warn if prev_state != IN_USER, but that will trigger + * too frequently to make sense yet. */ - user_exit(); + enum ctx_state prev_state = exception_enter(); schedule(); - user_enter(); + exception_exit(prev_state); } #endif @@ -4649,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) +{ + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; + + rcu_read_lock_sched(); + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); + + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + rcu_read_unlock_sched(); + + return ret; +} + +int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed) +{ + int ret = 0; + + /* + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. + */ + if (p->flags & PF_NO_SETAFFINITY) { + ret = -EINVAL; + goto out; + } + +#ifdef CONFIG_SMP + if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, + cs_cpus_allowed)) { + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b; + bool overflow; + int cpus; + unsigned long flags; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(dest_cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + + } +#endif +out: + return ret; +} + #ifdef CONFIG_SMP /* * move_queued_task - move a queued task to new rq. @@ -6099,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA static int sched_domains_numa_levels; +enum numa_topology_type sched_numa_topology_type; static int *sched_domains_numa_distance; +int sched_max_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; #endif @@ -6271,7 +6332,7 @@ static void sched_numa_warn(const char *str) printk(KERN_WARNING "\n"); } -static bool find_numa_distance(int distance) +bool find_numa_distance(int distance) { int i; @@ -6286,6 +6347,56 @@ static bool find_numa_distance(int distance) return false; } +/* + * A system can have three types of NUMA topology: + * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system + * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes + * NUMA_BACKPLANE: nodes can reach other nodes through a backplane + * + * The difference between a glueless mesh topology and a backplane + * topology lies in whether communication between not directly + * connected nodes goes through intermediary nodes (where programs + * could run), or through backplane controllers. This affects + * placement of programs. + * + * The type of topology can be discerned with the following tests: + * - If the maximum distance between any nodes is 1 hop, the system + * is directly connected. + * - If for two nodes A and B, located N > 1 hops away from each other, + * there is an intermediary node C, which is < N hops away from both + * nodes A and B, the system is a glueless mesh. + */ +static void init_numa_topology_type(void) +{ + int a, b, c, n; + + n = sched_max_numa_distance; + + if (n <= 1) + sched_numa_topology_type = NUMA_DIRECT; + + for_each_online_node(a) { + for_each_online_node(b) { + /* Find two nodes furthest removed from each other. */ + if (node_distance(a, b) < n) + continue; + + /* Is there an intermediary node between a and b? */ + for_each_online_node(c) { + if (node_distance(a, c) < n && + node_distance(b, c) < n) { + sched_numa_topology_type = + NUMA_GLUELESS_MESH; + return; + } + } + + sched_numa_topology_type = NUMA_BACKPLANE; + return; + } + } +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6422,6 +6533,9 @@ static void sched_init_numa(void) sched_domain_topology = tl; sched_domains_numa_levels = level; + sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + + init_numa_topology_type(); } static void sched_domains_numa_masks_set(int cpu) @@ -7174,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { + /* + * Blocking primitives will set (and therefore destroy) current->state, + * since we will exit with TASK_RUNNING make sure we enter with it, + * otherwise we will destroy state. + */ + if (WARN_ONCE(current->state != TASK_RUNNING, + "do not call blocking ops when !TASK_RUNNING; " + "state=%lx set at [<%p>] %pS\n", + current->state, + (void *)current->task_state_change, + (void *)current->task_state_change)) + __set_current_state(TASK_RUNNING); + + ___might_sleep(file, line, preempt_offset); +} +EXPORT_SYMBOL(__might_sleep); + +void ___might_sleep(const char *file, int line, int preempt_offset) +{ static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ @@ -7205,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) #endif dump_stack(); } -EXPORT_SYMBOL(__might_sleep); +EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 538c9796ad4a..020039bd1326 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); int cpudl_init(struct cpudl *cp); void cpudl_cleanup(struct cpudl *cp); -#else -#define cpudl_set(cp, cpu, dl) do { } while (0) -#define cpudl_init() do { } while (0) #endif /* CONFIG_SMP */ #endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 6b033347fdfd..63cbb9ca0496 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#else -#define cpupri_set(cp, cpu, pri) do { } while (0) -#define cpupri_init() do { } while (0) #endif #endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 28fa9d9e9201..e5db8c6feebd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -563,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - if (hrtimer_active(timer)) { - hrtimer_try_to_cancel(timer); - return; - } - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); timer->function = dl_task_timer; } @@ -633,7 +628,7 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= delta_exec; + dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { __dequeue_task_dl(rq, curr, 0); if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) @@ -933,7 +928,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (sd_flag != SD_BALANCE_WAKE) goto out; rq = cpu_rq(cpu); @@ -1018,6 +1013,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) { hrtick_start(rq, p->dl.runtime); } +#else /* !CONFIG_SCHED_HRTICK */ +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ +} #endif static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, @@ -1071,10 +1070,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) /* Running task will never be pushed. */ dequeue_pushable_dl_task(rq, p); -#ifdef CONFIG_SCHED_HRTICK if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); -#endif set_post_schedule(rq); @@ -1093,10 +1090,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); -#ifdef CONFIG_SCHED_HRTICK if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) start_hrtick_dl(rq, p); -#endif } static void task_fork_dl(struct task_struct *p) @@ -1333,6 +1328,7 @@ static int push_dl_task(struct rq *rq) { struct task_struct *next_task; struct rq *later_rq; + int ret = 0; if (!rq->dl.overloaded) return 0; @@ -1378,7 +1374,6 @@ retry: * The task is still there. We don't try * again, some other cpu will pull it when ready. */ - dequeue_pushable_dl_task(rq, next_task); goto out; } @@ -1394,6 +1389,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); activate_task(later_rq, next_task, 0); + ret = 1; resched_curr(later_rq); @@ -1402,7 +1398,7 @@ retry: out: put_task_struct(next_task); - return 1; + return ret; } static void push_dl_tasks(struct rq *rq) @@ -1508,7 +1504,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) p->nr_cpus_allowed > 1 && dl_task(rq->curr) && (rq->curr->nr_cpus_allowed < 2 || - dl_entity_preempt(&rq->curr->dl, &p->dl))) { + !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } } @@ -1517,10 +1513,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq; + struct root_domain *src_rd; int weight; BUG_ON(!dl_task(p)); + rq = task_rq(p); + src_rd = rq->rd; + /* + * Migrating a SCHED_DEADLINE task between exclusive + * cpusets (different root_domains) entails a bandwidth + * update. We already made space for us in the destination + * domain (see cpuset_can_attach()). + */ + if (!cpumask_intersects(src_rd->span, new_mask)) { + struct dl_bw *src_dl_b; + + src_dl_b = dl_bw_of(cpu_of(rq)); + /* + * We now free resources of the root_domain we are migrating + * off. In the worst case, sched_setattr() may temporary fail + * until we complete the update. + */ + raw_spin_lock(&src_dl_b->lock); + __dl_clear(src_dl_b, p->dl.dl_bw); + raw_spin_unlock(&src_dl_b->lock); + } + /* * Update only if the task is actually running (i.e., * it is on the rq AND it is not throttled). @@ -1537,8 +1556,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; - rq = task_rq(p); - /* * The process used to be able to migrate OR it can now migrate */ @@ -1586,22 +1603,48 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ +/* + * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. + */ +static void cancel_dl_timer(struct rq *rq, struct task_struct *p) +{ + struct hrtimer *dl_timer = &p->dl.dl_timer; + + /* Nobody will change task's class if pi_lock is held */ + lockdep_assert_held(&p->pi_lock); + + if (hrtimer_active(dl_timer)) { + int ret = hrtimer_try_to_cancel(dl_timer); + + if (unlikely(ret == -1)) { + /* + * Note, p may migrate OR new deadline tasks + * may appear in rq when we are unlocking it. + * A caller of us must be fine with that. + */ + raw_spin_unlock(&rq->lock); + hrtimer_cancel(dl_timer); + raw_spin_lock(&rq->lock); + } + } +} + static void switched_from_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) - hrtimer_try_to_cancel(&p->dl.dl_timer); + cancel_dl_timer(rq, p); __dl_clear_params(p); -#ifdef CONFIG_SMP /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one * from an overloaded cpu, if any. */ - if (!rq->dl.dl_nr_running) - pull_dl_task(rq); -#endif + if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) + return; + + if (pull_dl_task(rq)) + resched_curr(rq); } /* @@ -1622,7 +1665,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && + push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ @@ -1704,3 +1748,12 @@ const struct sched_class dl_sched_class = { .update_curr = update_curr_dl, }; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); + +void print_dl_stats(struct seq_file *m, int cpu) +{ + print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); +} +#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ce33780d8f20..92cc52001e74 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #undef P } +void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) +{ + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); + SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +} + extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) @@ -329,6 +335,7 @@ do { \ spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + print_dl_stats(m, cpu); print_rq(m, rq, cpu); spin_unlock_irqrestore(&sched_debug_lock, flags); @@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults_memory) - nr_faults = p->numa_faults_memory[2*node + i]; + if (p->numa_faults) + nr_faults = p->numa_faults[2*node + i]; cpu_current = !i ? (task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef2b104b254c..df2cdf77f899 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -873,7 +873,6 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; - struct list_head task_list; struct rcu_head rcu; nodemask_t active_nodes; @@ -901,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p) return p->numa_group ? p->numa_group->gid : 0; } -static inline int task_faults_idx(int nid, int priv) +/* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ +static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) { - return NR_NUMA_HINT_FAULT_TYPES * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; - return p->numa_faults_memory[task_faults_idx(nid, 0)] + - p->numa_faults_memory[task_faults_idx(nid, 1)]; + return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -920,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) if (!p->numa_group) return 0; - return p->numa_group->faults[task_faults_idx(nid, 0)] + - p->numa_group->faults[task_faults_idx(nid, 1)]; + return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(nid, 0)] + - group->faults_cpu[task_faults_idx(nid, 1)]; + return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + + group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; +} + +/* Handle placement on systems where not all nodes are directly connected. */ +static unsigned long score_nearby_nodes(struct task_struct *p, int nid, + int maxdist, bool task) +{ + unsigned long score = 0; + int node; + + /* + * All nodes are directly connected, and the same distance + * from each other. No need for fancy placement algorithms. + */ + if (sched_numa_topology_type == NUMA_DIRECT) + return 0; + + /* + * This code is called for each node, introducing N^2 complexity, + * which should be ok given the number of nodes rarely exceeds 8. + */ + for_each_online_node(node) { + unsigned long faults; + int dist = node_distance(nid, node); + + /* + * The furthest away nodes in the system are not interesting + * for placement; nid was already counted. + */ + if (dist == sched_max_numa_distance || node == nid) + continue; + + /* + * On systems with a backplane NUMA topology, compare groups + * of nodes, and move tasks towards the group with the most + * memory accesses. When comparing two nodes at distance + * "hoplimit", only nodes closer by than "hoplimit" are part + * of each group. Skip other nodes. + */ + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist > maxdist) + continue; + + /* Add up the faults from nearby nodes. */ + if (task) + faults = task_faults(p, node); + else + faults = group_faults(p, node); + + /* + * On systems with a glueless mesh NUMA topology, there are + * no fixed "groups of nodes". Instead, nodes that are not + * directly connected bounce traffic through intermediate + * nodes; a numa_group can occupy any set of nodes. + * The further away a node is, the less the faults count. + * This seems to result in good task placement. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + faults *= (sched_max_numa_distance - dist); + faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + } + + score += faults; + } + + return score; } /* @@ -936,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes. */ -static inline unsigned long task_weight(struct task_struct *p, int nid) +static inline unsigned long task_weight(struct task_struct *p, int nid, + int dist) { - unsigned long total_faults; + unsigned long faults, total_faults; - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; total_faults = p->total_numa_faults; @@ -948,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) if (!total_faults) return 0; - return 1000 * task_faults(p, nid) / total_faults; + faults = task_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, true); + + return 1000 * faults / total_faults; } -static inline unsigned long group_weight(struct task_struct *p, int nid) +static inline unsigned long group_weight(struct task_struct *p, int nid, + int dist) { - if (!p->numa_group || !p->numa_group->total_faults) + unsigned long faults, total_faults; + + if (!p->numa_group) return 0; - return 1000 * group_faults(p, nid) / p->numa_group->total_faults; + total_faults = p->numa_group->total_faults; + + if (!total_faults) + return 0; + + faults = group_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, false); + + return 1000 * faults / total_faults; } bool should_numa_migrate_memory(struct task_struct *p, struct page * page, @@ -1089,6 +1174,7 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; int imbalance_pct; + int dist; struct task_struct *best_task; long best_imp; @@ -1168,6 +1254,7 @@ static void task_numa_compare(struct task_numa_env *env, long load; long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; + int dist = env->dist; rcu_read_lock(); @@ -1208,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env, * in any group then look only at task weights. */ if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. @@ -1223,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env, * instead. */ if (cur->numa_group) - imp += group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); else - imp += task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } } @@ -1326,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p) }; struct sched_domain *sd; unsigned long taskweight, groupweight; - int nid, ret; + int nid, ret, dist; long taskimp, groupimp; /* @@ -1354,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p) return -EINVAL; } - taskweight = task_weight(p, env.src_nid); - groupweight = group_weight(p, env.src_nid); - update_numa_stats(&env.src_stats, env.src_nid); env.dst_nid = p->numa_preferred_nid; - taskimp = task_weight(p, env.dst_nid) - taskweight; - groupimp = group_weight(p, env.dst_nid) - groupweight; + dist = env.dist = node_distance(env.src_nid, env.dst_nid); + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + update_numa_stats(&env.src_stats, env.src_nid); + taskimp = task_weight(p, env.dst_nid, dist) - taskweight; + groupimp = group_weight(p, env.dst_nid, dist) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); - /* No space available on the preferred nid. Look elsewhere. */ - if (env.best_cpu == -1) { + /* + * Look at other nodes in these cases: + * - there is no space available on the preferred_nid + * - the task is part of a numa_group that is interleaved across + * multiple NUMA nodes; in order to better consolidate the group, + * we need to check other locations. + */ + if (env.best_cpu == -1 || (p->numa_group && + nodes_weight(p->numa_group->active_nodes) > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; + dist = node_distance(env.src_nid, env.dst_nid); + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist != env.dist) { + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + } + /* Only consider nodes where both task and groups benefit */ - taskimp = task_weight(p, nid) - taskweight; - groupimp = group_weight(p, nid) - groupweight; + taskimp = task_weight(p, nid, dist) - taskweight; + groupimp = group_weight(p, nid, dist) - groupweight; if (taskimp < 0 && groupimp < 0) continue; + env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); task_numa_find_cpu(&env, taskimp, groupimp); @@ -1431,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1580,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) return delta; } +/* + * Determine the preferred nid for a task in a numa_group. This needs to + * be done in a way that produces consistent results with group_weight, + * otherwise workloads might not converge. + */ +static int preferred_group_nid(struct task_struct *p, int nid) +{ + nodemask_t nodes; + int dist; + + /* Direct connections between all NUMA nodes. */ + if (sched_numa_topology_type == NUMA_DIRECT) + return nid; + + /* + * On a system with glueless mesh NUMA topology, group_weight + * scores nodes according to the number of NUMA hinting faults on + * both the node itself, and on nearby nodes. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + unsigned long score, max_score = 0; + int node, max_node = nid; + + dist = sched_max_numa_distance; + + for_each_online_node(node) { + score = group_weight(p, node, dist); + if (score > max_score) { + max_score = score; + max_node = node; + } + } + return max_node; + } + + /* + * Finding the preferred nid in a system with NUMA backplane + * interconnect topology is more involved. The goal is to locate + * tasks from numa_groups near each other in the system, and + * untangle workloads from different sides of the system. This requires + * searching down the hierarchy of node groups, recursively searching + * inside the highest scoring group of nodes. The nodemask tricks + * keep the complexity of the search down. + */ + nodes = node_online_map; + for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { + unsigned long max_faults = 0; + nodemask_t max_group; + int a, b; + + /* Are there nodes at this distance from each other? */ + if (!find_numa_distance(dist)) + continue; + + for_each_node_mask(a, nodes) { + unsigned long faults = 0; + nodemask_t this_group; + nodes_clear(this_group); + + /* Sum group's NUMA faults; includes a==b case. */ + for_each_node_mask(b, nodes) { + if (node_distance(a, b) < dist) { + faults += group_faults(p, b); + node_set(b, this_group); + node_clear(b, nodes); + } + } + + /* Remember the top group. */ + if (faults > max_faults) { + max_faults = faults; + max_group = this_group; + /* + * subtle: at the smallest distance there is + * just one node left in each "group", the + * winner is the preferred nid. + */ + nid = a; + } + } + /* Next round, evaluate the nodes within max_group. */ + nodes = max_group; + } + return nid; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; @@ -1607,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { + /* Keep track of the offsets in numa_faults array */ + int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; unsigned long faults = 0, group_faults = 0; - int priv, i; + int priv; for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { long diff, f_diff, f_weight; - i = task_faults_idx(nid, priv); + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); + membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); + cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); /* Decay existing window, copy faults since last scan */ - diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; - fault_types[priv] += p->numa_faults_buffer_memory[i]; - p->numa_faults_buffer_memory[i] = 0; + diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; + fault_types[priv] += p->numa_faults[membuf_idx]; + p->numa_faults[membuf_idx] = 0; /* * Normalize the faults_from, so all tasks in a group @@ -1628,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p) * faults are less important. */ f_weight = div64_u64(runtime << 16, period + 1); - f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / (total_faults + 1); - f_diff = f_weight - p->numa_faults_cpu[i] / 2; - p->numa_faults_buffer_cpu[i] = 0; + f_diff = f_weight - p->numa_faults[cpu_idx] / 2; + p->numa_faults[cpubuf_idx] = 0; - p->numa_faults_memory[i] += diff; - p->numa_faults_cpu[i] += f_diff; - faults += p->numa_faults_memory[i]; + p->numa_faults[mem_idx] += diff; + p->numa_faults[cpu_idx] += f_diff; + faults += p->numa_faults[mem_idx]; p->total_numa_faults += diff; if (p->numa_group) { - /* safe because we can only change our own group */ - p->numa_group->faults[i] += diff; - p->numa_group->faults_cpu[i] += f_diff; + /* + * safe because we can only change our own group + * + * mem_idx represents the offset for a given + * nid and priv in a specific region because it + * is at the beginning of the numa_faults array. + */ + p->numa_group->faults[mem_idx] += diff; + p->numa_group->faults_cpu[mem_idx] += f_diff; p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[i]; + group_faults += p->numa_group->faults[mem_idx]; } } @@ -1662,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); spin_unlock_irq(group_lock); - max_nid = max_group_nid; + max_nid = preferred_group_nid(p, max_group_nid); } if (max_faults) { @@ -1705,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, atomic_set(&grp->refcount, 1); spin_lock_init(&grp->lock); - INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * @@ -1714,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, node_set(task_node(current), grp->active_nodes); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] = p->numa_faults_memory[i]; + grp->faults[i] = p->numa_faults[i]; grp->total_faults = p->total_numa_faults; - list_add(&p->numa_entry, &grp->task_list); grp->nr_tasks++; rcu_assign_pointer(p->numa_group, grp); } @@ -1773,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults_memory[i]; - grp->faults[i] += p->numa_faults_memory[i]; + my_grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] += p->numa_faults[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; - list_move(&p->numa_entry, &grp->task_list); my_grp->nr_tasks--; grp->nr_tasks++; @@ -1799,27 +1996,23 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults_memory; + void *numa_faults = p->numa_faults; unsigned long flags; int i; if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] -= p->numa_faults[i]; grp->total_faults -= p->total_numa_faults; - list_del(&p->numa_entry); grp->nr_tasks--; spin_unlock_irqrestore(&grp->lock, flags); RCU_INIT_POINTER(p->numa_group, NULL); put_numa_group(grp); } - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; - p->numa_faults_cpu= NULL; - p->numa_faults_buffer_cpu = NULL; + p->numa_faults = NULL; kfree(numa_faults); } @@ -1842,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) return; /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * + if (unlikely(!p->numa_faults)) { + int size = sizeof(*p->numa_faults) * NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults_memory) + p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults) return; - BUG_ON(p->numa_faults_buffer_memory); - /* - * The averaged statistics, shared & private, memory & cpu, - * occupy the first half of the array. The second half of the - * array is for current counters, which are averaged into the - * first set by task_numa_placement. - */ - p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); - p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); - p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1899,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; - p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; p->numa_faults_locality[local] += pages; } @@ -4469,7 +4652,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } - } else { + } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; @@ -4547,9 +4730,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (p->nr_cpus_allowed == 1) - return prev_cpu; - if (sd_flag & SD_BALANCE_WAKE) want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); @@ -5189,7 +5369,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || !(env->sd->flags & SD_NUMA)) { return false; } @@ -5228,7 +5408,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); @@ -6172,8 +6352,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * with a large weight task outweighs the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) + sds->local_stat.group_has_free_capacity) { sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); + sgs->group_type = group_classify(sg, sgs); + } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 20bca398084a..ee15f5a0d1c1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (p->nr_cpus_allowed == 1) - goto out; - /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) goto out; @@ -1351,16 +1348,22 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->nr_cpus_allowed == 1) + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) - return; - /* * There appears to be other cpus that can accept * current and none to run 'p', so lets reschedule diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2df8ef067cc5..9a2a45c970e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -176,6 +176,25 @@ struct dl_bw { u64 bw, total_bw; }; +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + extern struct mutex sched_domains_mutex; #ifdef CONFIG_CGROUP_SCHED @@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +#endif + #ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); @@ -1127,6 +1164,11 @@ struct sched_class { void (*task_fork) (struct task_struct *p); void (*task_dead) (struct task_struct *p); + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. + */ void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, @@ -1504,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); +extern void print_dl_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 5a62915f47a8..852143a79f36 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/wait.h> #include <linux/hash.h> +#include <linux/kthread.h> void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) { @@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * } EXPORT_SYMBOL(autoremove_wake_function); +static inline bool is_kthread_should_stop(void) +{ + return (current->flags & PF_KTHREAD) && kthread_should_stop(); +} + +/* + * DEFINE_WAIT_FUNC(wait, woken_wake_func); + * + * add_wait_queue(&wq, &wait); + * for (;;) { + * if (condition) + * break; + * + * p->state = mode; condition = true; + * smp_mb(); // A smp_wmb(); // C + * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * schedule() try_to_wake_up(); + * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ + * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * smp_mb() // B smp_wmb(); // C + * wait->flags |= WQ_FLAG_WOKEN; + * } + * remove_wait_queue(&wq, &wait); + * + */ +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +{ + set_current_state(mode); /* A */ + /* + * The above implies an smp_mb(), which matches with the smp_wmb() from + * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must + * also observe all state before the wakeup. + */ + if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + /* + * The below implies an smp_mb(), it too pairs with the smp_wmb() from + * woken_wake_function() such that we must either observe the wait + * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss + * an event. + */ + set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + + return timeout; +} +EXPORT_SYMBOL(wait_woken); + +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expects write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in wait_woken(). + */ + smp_wmb(); /* C */ + wait->flags |= WQ_FLAG_WOKEN; + + return default_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(woken_wake_function); + int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; diff --git a/kernel/signal.c b/kernel/signal.c index 8f0876f9f6dd..16a305295256 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, local_irq_restore(*flags); break; } - + /* + * This sighand can be already freed and even reused, but + * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which + * initializes ->siglock: this slab can't go away, it has + * the same object type, ->siglock can't be reinitialized. + * + * We need to ensure that tsk->sighand is still the same + * after we take the lock, we can race with de_thread() or + * __exit_signal(). In the latter case the next iteration + * must see ->sighand == NULL. + */ spin_lock(&sighand->siglock); if (likely(sighand == tsk->sighand)) { rcu_read_unlock(); @@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) int error = -ESRCH; struct task_struct *p; - rcu_read_lock(); -retry: - p = pid_task(pid, PIDTYPE_PID); - if (p) { - error = group_send_sig_info(sig, info, p); - if (unlikely(error == -ESRCH)) - /* - * The task was unhashed in between, try again. - * If it is dead, pid_task() will return NULL, - * if we race with de_thread() it will find the - * new leader. - */ - goto retry; - } - rcu_read_unlock(); + for (;;) { + rcu_read_lock(); + p = pid_task(pid, PIDTYPE_PID); + if (p) + error = group_send_sig_info(sig, info, p); + rcu_read_unlock(); + if (likely(!p || error != -ESRCH)) + return error; - return error; + /* + * The task was unhashed in between, try again. If it + * is dead, pid_task() will return NULL, if we race with + * de_thread() it will find the new leader. + */ + } } int kill_proc_info(int sig, struct siginfo *info, pid_t pid) @@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif +#ifdef SEGV_BNDERR + err |= __put_user(from->si_lower, &to->si_lower); + err |= __put_user(from->si_upper, &to->si_upper); +#endif break; case __SI_CHLD: err |= __put_user(from->si_pid, &to->si_pid); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index eb89e1807408..f032fb5284e3 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data) set_current_state(TASK_INTERRUPTIBLE); preempt_disable(); if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->cleanup) ht->cleanup(td->cpu, cpu_online(td->cpu)); @@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data) /* Check for state change setup */ switch (td->status) { case HP_THREAD_NONE: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->setup) ht->setup(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; + case HP_THREAD_PARKED: + __set_current_state(TASK_RUNNING); preempt_enable(); if (ht->unpark) ht->unpark(td->cpu); td->status = HP_THREAD_ACTIVE; - preempt_disable(); - break; + continue; } if (!ht->thread_should_run(td->cpu)) { - preempt_enable(); + preempt_enable_no_resched(); schedule(); } else { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); preempt_enable(); ht->thread_fn(td->cpu); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 0699add19164..501baa9ac1be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu) * in the task stack here. */ __do_softirq(); - rcu_note_context_switch(cpu); + rcu_note_context_switch(); local_irq_enable(); cond_resched(); return; diff --git a/kernel/sys.c b/kernel/sys.c index 1eaa2f0b0246..a8c9f5a7dda6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -91,6 +91,12 @@ #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif +#ifndef MPX_ENABLE_MANAGEMENT +# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL) +#endif +#ifndef MPX_DISABLE_MANAGEMENT +# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2203,6 +2209,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, me->mm->def_flags &= ~VM_NOHUGEPAGE; up_write(&me->mm->mmap_sem); break; + case PR_MPX_ENABLE_MANAGEMENT: + error = MPX_ENABLE_MANAGEMENT(me); + break; + case PR_MPX_DISABLE_MANAGEMENT: + error = MPX_DISABLE_MANAGEMENT(me); + break; default: error = -EINVAL; break; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 7347426fa68d..f622cf28628a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o -obj-$(CONFIG_TEST_UDELAY) += udelay_test.o +obj-$(CONFIG_TEST_UDELAY) += test_udelay.o $(obj)/time.o: $(obj)/timeconst.h diff --git a/kernel/time/udelay_test.c b/kernel/time/test_udelay.c index e622ba365a13..e622ba365a13 100644 --- a/kernel/time/udelay_test.c +++ b/kernel/time/test_udelay.c diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7b5741fc4110..1f4356037a7d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -585,7 +585,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, last_jiffies = jiffies; } while (read_seqretry(&jiffies_lock, seq)); - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || + if (rcu_needs_cpu(&rcu_delta_jiffies) || arch_needs_cpu() || irq_work_needs_cpu()) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; diff --git a/kernel/time/time.c b/kernel/time/time.c index a9ae20fb0b11..65015ff2f07c 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -304,7 +304,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. +/* + * mktime64 - Converts date to seconds. + * Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. * @@ -314,15 +316,10 @@ EXPORT_SYMBOL(timespec_trunc); * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). - * - * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines where long is 32-bit! (However, as time_t is signed, we - * will already get problems at other places on 2038-01-19 03:14:08) */ -unsigned long -mktime(const unsigned int year0, const unsigned int mon0, - const unsigned int day, const unsigned int hour, - const unsigned int min, const unsigned int sec) +time64_t mktime64(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) { unsigned int mon = mon0, year = year0; @@ -332,15 +329,14 @@ mktime(const unsigned int year0, const unsigned int mon0, year -= 1; } - return ((((unsigned long) + return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 )*24 + hour /* now have hours */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } - -EXPORT_SYMBOL(mktime); +EXPORT_SYMBOL(mktime64); /** * set_normalized_timespec - set timespec sec and nsec parts and normalize diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ec1791fae965..6a931852082f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -417,7 +417,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); */ static inline void tk_update_ktime_data(struct timekeeper *tk) { - s64 nsec; + u64 seconds; + u32 nsec; /* * The xtime based monotonic readout is: @@ -426,13 +427,22 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) * nsec = base_mono + now(); * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec */ - nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); - nsec *= NSEC_PER_SEC; - nsec += tk->wall_to_monotonic.tv_nsec; - tk->tkr.base_mono = ns_to_ktime(nsec); + seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); + nsec = (u32) tk->wall_to_monotonic.tv_nsec; + tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* Update the monotonic raw base */ tk->base_raw = timespec64_to_ktime(tk->raw_time); + + /* + * The sum of the nanoseconds portions of xtime and + * wall_to_monotonic can be greater/equal one second. Take + * this into account before updating tk->ktime_sec. + */ + nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); + if (nsec >= NSEC_PER_SEC) + seconds++; + tk->ktime_sec = seconds; } /* must hold timekeeper_lock */ @@ -519,9 +529,9 @@ EXPORT_SYMBOL(__getnstimeofday64); /** * getnstimeofday64 - Returns the time of day in a timespec64. - * @ts: pointer to the timespec to be set + * @ts: pointer to the timespec64 to be set * - * Returns the time of day in a timespec (WARN if suspended). + * Returns the time of day in a timespec64 (WARN if suspended). */ void getnstimeofday64(struct timespec64 *ts) { @@ -623,7 +633,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw); * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. + * in normalized timespec64 format in the variable pointed to by @ts. */ void ktime_get_ts64(struct timespec64 *ts) { @@ -648,6 +658,54 @@ void ktime_get_ts64(struct timespec64 *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts64); +/** + * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC + * + * Returns the seconds portion of CLOCK_MONOTONIC with a single non + * serialized read. tk->ktime_sec is of type 'unsigned long' so this + * works on both 32 and 64 bit systems. On 32 bit systems the readout + * covers ~136 years of uptime which should be enough to prevent + * premature wrap arounds. + */ +time64_t ktime_get_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + WARN_ON(timekeeping_suspended); + return tk->ktime_sec; +} +EXPORT_SYMBOL_GPL(ktime_get_seconds); + +/** + * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME + * + * Returns the wall clock seconds since 1970. This replaces the + * get_seconds() interface which is not y2038 safe on 32bit systems. + * + * For 64bit systems the fast access to tk->xtime_sec is preserved. On + * 32bit systems the access must be protected with the sequence + * counter to provide "atomic" access to the 64bit tk->xtime_sec + * value. + */ +time64_t ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + time64_t seconds; + unsigned int seq; + + if (IS_ENABLED(CONFIG_64BIT)) + return tk->xtime_sec; + + do { + seq = read_seqcount_begin(&tk_core.seq); + seconds = tk->xtime_sec; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return seconds; +} +EXPORT_SYMBOL_GPL(ktime_get_real_seconds); + #ifdef CONFIG_NTP_PPS /** @@ -703,18 +761,18 @@ void do_gettimeofday(struct timeval *tv) EXPORT_SYMBOL(do_gettimeofday); /** - * do_settimeofday - Sets the time of day - * @tv: pointer to the timespec variable containing the new time + * do_settimeofday64 - Sets the time of day. + * @ts: pointer to the timespec64 variable containing the new time * * Sets the time of day to the new time and update NTP and notify hrtimers */ -int do_settimeofday(const struct timespec *tv) +int do_settimeofday64(const struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 ts_delta, xt, tmp; + struct timespec64 ts_delta, xt; unsigned long flags; - if (!timespec_valid_strict(tv)) + if (!timespec64_valid_strict(ts)) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -723,13 +781,12 @@ int do_settimeofday(const struct timespec *tv) timekeeping_forward_now(tk); xt = tk_xtime(tk); - ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; + ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; + ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); - tmp = timespec_to_timespec64(*tv); - tk_set_xtime(tk, &tmp); + tk_set_xtime(tk, ts); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -741,7 +798,7 @@ int do_settimeofday(const struct timespec *tv) return 0; } -EXPORT_SYMBOL(do_settimeofday); +EXPORT_SYMBOL(do_settimeofday64); /** * timekeeping_inject_offset - Adds or subtracts from the current time. @@ -895,12 +952,12 @@ int timekeeping_notify(struct clocksource *clock) } /** - * getrawmonotonic - Returns the raw monotonic time in a timespec - * @ts: pointer to the timespec to be set + * getrawmonotonic64 - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ -void getrawmonotonic(struct timespec *ts) +void getrawmonotonic64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts64; @@ -915,9 +972,10 @@ void getrawmonotonic(struct timespec *ts) } while (read_seqcount_retry(&tk_core.seq, seq)); timespec64_add_ns(&ts64, nsecs); - *ts = timespec64_to_timespec(ts64); + *ts = ts64; } -EXPORT_SYMBOL(getrawmonotonic); +EXPORT_SYMBOL(getrawmonotonic64); + /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres @@ -1068,8 +1126,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, } /** - * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values - * @delta: pointer to a timespec delta value + * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values + * @delta: pointer to a timespec64 delta value * * This hook is for architectures that cannot support read_persistent_clock * because their RTC/persistent clock is only accessible when irqs are enabled. @@ -1077,10 +1135,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ -void timekeeping_inject_sleeptime(struct timespec *delta) +void timekeeping_inject_sleeptime64(struct timespec64 *delta) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 tmp; unsigned long flags; /* @@ -1095,8 +1152,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_forward_now(tk); - tmp = timespec_to_timespec64(*delta); - __timekeeping_inject_sleeptime(tk, &tmp); + __timekeeping_inject_sleeptime(tk, delta); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1332,6 +1388,12 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, * * XXX - TODO: Doc ntp_error calculation. */ + if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { + /* NTP adjustment caused clocksource mult overflow */ + WARN_ON_ONCE(1); + return; + } + tk->tkr.mult += mult_adj; tk->xtime_interval += interval; tk->tkr.xtime_nsec -= offset; @@ -1397,7 +1459,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) } if (unlikely(tk->tkr.clock->maxadj && - (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { + (abs(tk->tkr.mult - tk->tkr.clock->mult) + > tk->tkr.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->tkr.clock->name, (long)tk->tkr.mult, @@ -1646,7 +1709,7 @@ struct timespec current_kernel_time(void) } EXPORT_SYMBOL(current_kernel_time); -struct timespec get_monotonic_coarse(void) +struct timespec64 get_monotonic_coarse64(void) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; @@ -1662,7 +1725,7 @@ struct timespec get_monotonic_coarse(void) set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); - return timespec64_to_timespec(now); + return now; } /* diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3260ffdb368f..2d3f5c504939 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1377,12 +1377,11 @@ unsigned long get_next_timer_interrupt(unsigned long now) void update_process_times(int user_tick) { struct task_struct *p = current; - int cpu = smp_processor_id(); /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); - rcu_check_callbacks(cpu, user_tick); + rcu_check_callbacks(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); |