diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/irq/Kconfig | 3 | ||||
-rw-r--r-- | kernel/irq/handle.c | 111 | ||||
-rw-r--r-- | kernel/lockdep.c | 18 | ||||
-rw-r--r-- | kernel/sched.c | 26 | ||||
-rw-r--r-- | kernel/sched_autogroup.c | 32 | ||||
-rw-r--r-- | kernel/sched_autogroup.h | 4 | ||||
-rw-r--r-- | kernel/sched_debug.c | 42 | ||||
-rw-r--r-- | kernel/sched_fair.c | 35 | ||||
-rw-r--r-- | kernel/smp.c | 62 | ||||
-rw-r--r-- | kernel/trace/trace_irqsoff.c | 8 | ||||
-rw-r--r-- | kernel/workqueue.c | 20 |
11 files changed, 187 insertions, 174 deletions
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 31d766bf5d2e..8e42fec7686d 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -9,9 +9,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED def_bool n diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e2347eb63306..3540a7190122 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) return retval; } - -#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ - -#ifdef CONFIG_ENABLE_WARN_DEPRECATED -# warning __do_IRQ is deprecated. Please convert to proper flow handlers -#endif - -/** - * __do_IRQ - original all in one highlevel IRQ handler - * @irq: the interrupt number - * - * __do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - * - * This is the original x86 implementation which is used for every - * interrupt type. - */ -unsigned int __do_IRQ(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - unsigned int status; - - kstat_incr_irqs_this_cpu(irq, desc); - - if (CHECK_IRQ_PER_CPU(desc->status)) { - irqreturn_t action_ret; - - /* - * No locking required for CPU-local interrupts: - */ - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - if (likely(!(desc->status & IRQ_DISABLED))) { - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - } - desc->irq_data.chip->end(irq); - return 1; - } - - raw_spin_lock(&desc->lock); - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - /* - * REPLAY is when Linux resends an IRQ that was dropped earlier - * WAITING is used by probe to mark irqs that are being tested - */ - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - status |= IRQ_PENDING; /* we _want_ to handle it */ - - /* - * If the IRQ is disabled for whatever reason, we cannot - * use the action we have. - */ - action = NULL; - if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { - action = desc->action; - status &= ~IRQ_PENDING; /* we commit to handling */ - status |= IRQ_INPROGRESS; /* we are handling it */ - } - desc->status = status; - - /* - * If there is no IRQ handler or it was disabled, exit early. - * Since we set PENDING, if another processor is handling - * a different instance of this same irq, the other processor - * will take care of it. - */ - if (unlikely(!action)) - goto out; - - /* - * Edge triggered interrupts need to remember - * pending events. - * This applies to any hw interrupts that allow a second - * instance of the same irq to arrive while we are in do_IRQ - * or in the handler. But the code here only handles the _second_ - * instance of the irq, not the third or fourth. So it is mostly - * useful for irq hardware that does not mask cleanly in an - * SMP environment. - */ - for (;;) { - irqreturn_t action_ret; - - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - if (likely(!(desc->status & IRQ_PENDING))) - break; - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - -out: - /* - * The ->end() handler has to deal with interrupts which got - * disabled while the handler was running. - */ - desc->irq_data.chip->end(irq); - raw_spin_unlock(&desc->lock); - - return 1; -} -#endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65dff7d9..0d2058da80f5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) } /* - * Debugging helper: via this flag we know that we are in - * 'early bootup code', and will warn about any invalid irqs-on event: - */ -static int early_boot_irqs_enabled; - -void early_boot_irqs_off(void) -{ - early_boot_irqs_enabled = 0; -} - -void early_boot_irqs_on(void) -{ - early_boot_irqs_enabled = 1; -} - -/* * Hardirqs will be enabled: */ void trace_hardirqs_on_caller(unsigned long ip) @@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; - if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; if (unlikely(curr->hardirqs_enabled)) { diff --git a/kernel/sched.c b/kernel/sched.c index ea3e5eff3878..18d38e4ec7ba 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -553,9 +553,6 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; - - /* BKL stats */ - unsigned int bkl_count; #endif }; @@ -609,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; + if (p->flags & PF_EXITING) + return &root_task_group; + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); @@ -3887,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); #ifdef CONFIG_SCHEDSTATS if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), bkl_count); + schedstat_inc(this_rq(), rq_sched_info.bkl_count); schedstat_inc(prev, sched_info.bkl_count); } #endif @@ -4871,7 +4871,8 @@ recheck: * assigned. */ if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) { + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return -EPERM; @@ -8882,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } } +static void +cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + sched_move_task(task); +} + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) @@ -8954,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .destroy = cpu_cgroup_destroy, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, + .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, .early_init = 1, diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 32a723b8f84c..9fb656283157 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref) { struct autogroup *ag = container_of(kref, struct autogroup, kref); +#ifdef CONFIG_RT_GROUP_SCHED + /* We've redirected RT tasks to the root task group... */ + ag->tg->rt_se = NULL; + ag->tg->rt_rq = NULL; +#endif sched_destroy_group(ag->tg); } @@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) return ag; } +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg); +#endif + static inline struct autogroup *autogroup_create(void) { struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); @@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void) init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); ag->tg = tg; +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Autogroup RT tasks are redirected to the root task group + * so we don't have to move tasks around upon policy change, + * or flail around trying to allocate bandwidth on the fly. + * A bandwidth exception in __sched_setscheduler() allows + * the policy change to proceed. Thereafter, task_group() + * returns &root_task_group, so zero bandwidth is required. + */ + free_rt_sched_group(tg); + tg->rt_se = root_task_group.rt_se; + tg->rt_rq = root_task_group.rt_rq; +#endif tg->autogroup = ag; return ag; @@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return tg != &root_task_group && tg->autogroup; +} + static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { @@ -231,6 +258,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (!enabled || !tg->autogroup) + return 0; + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } #endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 5358e241cb20..7b859ffe5dad 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg); static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1dfae3d014b5..eb6cb8edd075 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -16,6 +16,8 @@ #include <linux/kallsyms.h> #include <linux/utsname.h> +static DEFINE_SPINLOCK(sched_debug_lock); + /* * This allows printing both to /proc/sched_debug and * to the console @@ -86,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } #endif +#ifdef CONFIG_CGROUP_SCHED +static char group_path[PATH_MAX]; + +static char *task_group_path(struct task_group *tg) +{ + if (autogroup_path(tg, group_path, PATH_MAX)) + return group_path; + + /* + * May be NULL if the underlying cgroup isn't fully-created yet + */ + if (!tg->css.cgroup) { + group_path[0] = '\0'; + return group_path; + } + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; +} +#endif + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -108,6 +130,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif +#ifdef CONFIG_CGROUP_SCHED + SEQ_printf(m, " %s", task_group_path(task_group(p))); +#endif SEQ_printf(m, "\n"); } @@ -144,7 +169,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); +#else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -191,7 +220,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { +#ifdef CONFIG_RT_GROUP_SCHED + SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); +#else SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) @@ -212,6 +245,7 @@ extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long flags; #ifdef CONFIG_X86 { @@ -262,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu) P(ttwu_count); P(ttwu_local); - P(bkl_count); + SEQ_printf(m, " .%-30s: %d\n", "bkl_count", + rq->rq_sched_info.bkl_count); #undef P +#undef P64 #endif + spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + rcu_read_lock(); print_rq(m, rq, cpu); + rcu_read_unlock(); + spin_unlock_irqrestore(&sched_debug_lock, flags); } static const char *sched_tunable_scaling_names[] = { diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c62ebae65cf0..77e9166d7bbf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1062,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) struct sched_entity *se = __pick_next_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; + if (delta < 0) + return; + if (delta > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); } @@ -1362,27 +1365,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long S, rw, s, a, b; + long lw, w; - S = se->my_q->tg->shares; - s = se->load.weight; - rw = se->my_q->load.weight; + tg = se->my_q->tg; + w = se->my_q->load.weight; - a = S*(rw + wl); - b = S*rw + s*wg; + /* use this cpu's instantaneous contribution */ + lw = atomic_read(&tg->load_weight); + lw -= se->my_q->load_contribution; + lw += w + wg; - wl = s*(a-b); + wl += w; - if (likely(b)) - wl /= b; + if (lw > 0 && wl < lw) + wl = (wl * tg->shares) / lw; + else + wl = tg->shares; - /* - * Assume the group is already running and will - * thus already be accounted for in the weight. - * - * That is, moving shares between CPUs, does not - * alter the group weight. - */ + /* zero point is MIN_SHARES */ + if (wl < MIN_SHARES) + wl = MIN_SHARES; + wl -= se->load.weight; wg = 0; } diff --git a/kernel/smp.c b/kernel/smp.c index 4ec30e069987..9910744f0856 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,23 +194,52 @@ void generic_smp_call_function_interrupt(void) */ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; + void (*func) (void *info); - if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) + /* + * Since we walk the list without any locks, we might + * see an entry that was completed, removed from the + * list and is in the process of being reused. + * + * We must check that the cpu is in the cpumask before + * checking the refs, and both must be set before + * executing the callback on this cpu. + */ + + if (!cpumask_test_cpu(cpu, data->cpumask)) + continue; + + smp_rmb(); + + if (atomic_read(&data->refs) == 0) continue; + func = data->csd.func; /* for later warn */ data->csd.func(data->csd.info); + /* + * If the cpu mask is not still set then it enabled interrupts, + * we took another smp interrupt, and executed the function + * twice on this cpu. In theory that copy decremented refs. + */ + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { + WARN(1, "%pS enabled interrupts and double executed\n", + func); + continue; + } + refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); - if (!refs) { - raw_spin_lock(&call_function.lock); - list_del_rcu(&data->csd.list); - raw_spin_unlock(&call_function.lock); - } if (refs) continue; + WARN_ON(!cpumask_empty(data->cpumask)); + + raw_spin_lock(&call_function.lock); + list_del_rcu(&data->csd.list); + raw_spin_unlock(&call_function.lock); + csd_unlock(&data->csd); } @@ -430,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask, * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress); + && !oops_in_progress && !early_boot_irqs_disabled); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); @@ -454,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); + BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); + + /* + * To ensure the interrupt handler gets an complete view + * we order the cpumask and refs writes and order the read + * of them in the interrupt handler. In addition we may + * only clear our own cpu bit from the mask. + */ + smp_wmb(); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); raw_spin_lock_irqsave(&call_function.lock, flags); @@ -533,17 +572,20 @@ void ipi_call_unlock_irq(void) #endif /* USE_GENERIC_SMP_HELPERS */ /* - * Call a function on all processors + * Call a function on all processors. May be used during early boot while + * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead + * of local_irq_disable/enable(). */ int on_each_cpu(void (*func) (void *info), void *info, int wait) { + unsigned long flags; int ret = 0; preempt_disable(); ret = smp_call_function(func, info, wait); - local_irq_disable(); + local_irq_save(flags); func(info); - local_irq_enable(); + local_irq_restore(flags); preempt_enable(); return ret; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c602b880..92b6e1e12d98 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) * Stubs: */ -void early_boot_irqs_off(void) -{ -} - -void early_boot_irqs_on(void) -{ -} - void trace_softirqs_on(unsigned long ip) { } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8ee6ec82f88a..11869faa6819 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) worker->flags &= ~flags; - /* if transitioning out of NOT_RUNNING, increment nr_running */ + /* + * If transitioning out of NOT_RUNNING, increment nr_running. Note + * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask + * of multiple flags, not a single flag. + */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(get_gcwq_nr_running(gcwq->cpu)); @@ -1840,7 +1844,7 @@ __acquires(&gcwq->lock) spin_unlock_irq(&gcwq->lock); work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); f(work); @@ -2384,8 +2388,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); - lock_map_acquire(&cwq->wq->lockdep_map); + /* + * If @max_active is 1 or rescuer is in use, flushing another work + * item on the same workqueue may lead to deadlock. Make sure the + * flusher is not running on the same workqueue by verifying write + * access. + */ + if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) + lock_map_acquire(&cwq->wq->lockdep_map); + else + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_release(&cwq->wq->lockdep_map); + return true; already_gone: spin_unlock_irq(&gcwq->lock); |