diff options
Diffstat (limited to 'kernel')
81 files changed, 2838 insertions, 1580 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 6670fbd3e466..d15c0ee4d955 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -147,7 +147,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) again: smp_rmb(); rcu_read_lock(); - res = to_acct(ACCESS_ONCE(ns->bacct)); + res = to_acct(READ_ONCE(ns->bacct)); if (!res) { rcu_read_unlock(); return NULL; @@ -159,7 +159,7 @@ again: } rcu_read_unlock(); mutex_lock(&res->lock); - if (res != to_acct(ACCESS_ONCE(ns->bacct))) { + if (res != to_acct(READ_ONCE(ns->bacct))) { mutex_unlock(&res->lock); acct_put(res); goto again; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e2636737b69b..c4b9ab01bba5 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4657e2924ecb..f7efa7b4d825 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -57,7 +57,7 @@ #include <linux/backing-dev.h> #include <linux/sort.h> #include <linux/oom.h> - +#include <linux/sched/isolation.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <linux/mutex.h> @@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ - cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; - if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) - goto done; - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - non_isolated_cpus); + housekeeping_cpumask(HK_FLAG_DOMAIN)); goto done; } @@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains, */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) + cpumask_intersects(cp->cpus_allowed, + housekeeping_cpumask(HK_FLAG_DOMAIN)))) continue; if (is_sched_load_balance(cp)) @@ -789,7 +785,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, non_isolated_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -802,7 +798,6 @@ restart: BUG_ON(nslot != ndoms); done: - free_cpumask_var(non_isolated_cpus); kfree(csa); /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 10cdb9c26b5d..4c39c05e029a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -209,7 +209,7 @@ static int event_function(void *info) struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* @@ -306,7 +306,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) @@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } +/* + * State based event timekeeping... + * + * The basic idea is to use event->state to determine which (if any) time + * fields to increment with the current delta. This means we only need to + * update timestamps when we change state or when they are explicitly requested + * (read). + * + * Event groups make things a little more complicated, but not terribly so. The + * rules for a group are that if the group leader is OFF the entire group is + * OFF, irrespecive of what the group member states are. This results in + * __perf_effective_state(). + * + * A futher ramification is that when a group leader flips between OFF and + * !OFF, we need to update all group member times. + * + * + * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we + * need to make sure the relevant context time is updated before we try and + * update our timestamps. + */ + +static __always_inline enum perf_event_state +__perf_effective_state(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + + if (leader->state <= PERF_EVENT_STATE_OFF) + return leader->state; + + return event->state; +} + +static __always_inline void +__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) +{ + enum perf_event_state state = __perf_effective_state(event); + u64 delta = now - event->tstamp; + + *enabled = event->total_time_enabled; + if (state >= PERF_EVENT_STATE_INACTIVE) + *enabled += delta; + + *running = event->total_time_running; + if (state >= PERF_EVENT_STATE_ACTIVE) + *running += delta; +} + +static void perf_event_update_time(struct perf_event *event) +{ + u64 now = perf_event_time(event); + + __perf_update_times(event, now, &event->total_time_enabled, + &event->total_time_running); + event->tstamp = now; +} + +static void perf_event_update_sibling_time(struct perf_event *leader) +{ + struct perf_event *sibling; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) + perf_event_update_time(sibling); +} + +static void +perf_event_set_state(struct perf_event *event, enum perf_event_state state) +{ + if (event->state == state) + return; + + perf_event_update_time(event); + /* + * If a group leader gets enabled/disabled all its siblings + * are affected too. + */ + if ((event->state < 0) ^ (state < 0)) + perf_event_update_sibling_time(event); + + WRITE_ONCE(event->state, state); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) event->shadow_ctx_time = now - t->timestamp; } -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ - /* - * when the current task's perf cgroup does not match - * the event's, we need to remember to call the - * perf_mark_enable() function the first time a task with - * a matching perf cgroup is scheduled in. - */ - if (is_cgroup_event(event) && !perf_cgroup_match(event)) - event->cgrp_defer_enabled = 1; -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - if (!event->cgrp_defer_enabled) - return; - - event->cgrp_defer_enabled = 0; - - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - sub->cgrp_defer_enabled = 0; - } - } -} - /* * Update cpuctx->cgrp so that it is set when first cgroup event is added and * cleared when last cgroup event is removed. @@ -975,17 +1023,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) } static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ -} - -static inline void list_update_cgroup_event(struct perf_event *event, struct perf_event_context *ctx, bool add) { @@ -1006,7 +1043,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) struct perf_cpu_context *cpuctx; int rotations = 0; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); rotations = perf_rotate_context(cpuctx); @@ -1093,7 +1130,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx) { struct list_head *head = this_cpu_ptr(&active_ctx_list); - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); WARN_ON(!list_empty(&ctx->active_ctx_list)); @@ -1102,7 +1139,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx) static void perf_event_ctx_deactivate(struct perf_event_context *ctx) { - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); WARN_ON(list_empty(&ctx->active_ctx_list)); @@ -1202,7 +1239,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting) again: rcu_read_lock(); - ctx = ACCESS_ONCE(event->ctx); + ctx = READ_ONCE(event->ctx); if (!atomic_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; @@ -1398,60 +1435,6 @@ static u64 perf_event_time(struct perf_event *event) return ctx ? ctx->time : 0; } -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - lockdep_assert_held(&ctx->lock); - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - - /* - * in cgroup mode, time_enabled represents - * the time the event was enabled AND active - * tasks were in the monitored cgroup. This is - * independent of the activity of the context as - * there may be a mix of cgroup and non-cgroup events. - * - * That is why we treat cgroup events differently - * here. - */ - if (is_cgroup_event(event)) - run_end = perf_cgroup_event_time(event); - else if (ctx->is_active) - run_end = ctx->time; - else - run_end = event->tstamp_stopped; - - event->total_time_enabled = run_end - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = perf_event_time(event); - - event->total_time_running = run_end - event->tstamp_running; - -} - -/* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; @@ -1494,6 +1477,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; + event->tstamp = perf_event_time(event); + /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that @@ -1701,8 +1686,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) list_del_init(&event->group_entry); - update_group_times(event); - /* * If event was in error state, then keep it * that way, otherwise bogus counts will be @@ -1711,7 +1694,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * of the event */ if (event->state > PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_OFF; + perf_event_set_state(event, PERF_EVENT_STATE_OFF); ctx->generation++; } @@ -1810,38 +1793,24 @@ event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - u64 delta; + enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); - /* - * An event which could not be activated because of - * filter mismatch still needs to have its timings - * maintained, otherwise bogus information is return - * via read() for time_enabled, time_running: - */ - if (event->state == PERF_EVENT_STATE_INACTIVE && - !event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; - } - if (event->state != PERF_EVENT_STATE_ACTIVE) return; perf_pmu_disable(event->pmu); - event->tstamp_stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; - event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { event->pending_disable = 0; - event->state = PERF_EVENT_STATE_OFF; + state = PERF_EVENT_STATE_OFF; } + perf_event_set_state(event, state); if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1861,7 +1830,9 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; - int state = group_event->state; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; perf_pmu_disable(ctx->pmu); @@ -1875,7 +1846,7 @@ group_sched_out(struct perf_event *group_event, perf_pmu_enable(ctx->pmu); - if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) + if (group_event->attr.exclusive) cpuctx->exclusive = 0; } @@ -1895,6 +1866,11 @@ __perf_remove_from_context(struct perf_event *event, { unsigned long flags = (unsigned long)info; + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + event_sched_out(event, cpuctx, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); @@ -1957,14 +1933,17 @@ static void __perf_event_disable(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return; - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + } + if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); else event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; + + perf_event_set_state(event, PERF_EVENT_STATE_OFF); } /* @@ -2021,8 +2000,7 @@ void perf_event_disable_inatomic(struct perf_event *event) } static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx, - u64 tstamp) + struct perf_event_context *ctx) { /* * use the correct time source for the time snapshot @@ -2050,9 +2028,9 @@ static void perf_set_shadow_time(struct perf_event *event, * is cleaner and simpler to understand. */ if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, tstamp); + perf_cgroup_set_shadow_time(event, event->tstamp); else - event->shadow_ctx_time = tstamp - ctx->timestamp; + event->shadow_ctx_time = event->tstamp - ctx->timestamp; } #define MAX_INTERRUPTS (~0ULL) @@ -2065,7 +2043,6 @@ event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); int ret = 0; lockdep_assert_held(&ctx->lock); @@ -2075,11 +2052,12 @@ event_sched_in(struct perf_event *event, WRITE_ONCE(event->oncpu, smp_processor_id()); /* - * Order event::oncpu write to happen before the ACTIVE state - * is visible. + * Order event::oncpu write to happen before the ACTIVE state is + * visible. This allows perf_event_{stop,read}() to observe the correct + * ->oncpu if it sees ACTIVE. */ smp_wmb(); - WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); + perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2091,26 +2069,19 @@ event_sched_in(struct perf_event *event, event->hw.interrupts = 0; } - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx, tstamp); + perf_set_shadow_time(event, ctx); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { - event->state = PERF_EVENT_STATE_INACTIVE; + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) @@ -2134,8 +2105,6 @@ group_sched_in(struct perf_event *group_event, { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = ctx->pmu; - u64 now = ctx->time; - bool simulate = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -2165,27 +2134,13 @@ group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: - * The events up to the failed event are scheduled out normally, - * tstamp_stopped will be updated. - * - * The failed events and the remaining siblings need to have - * their timings updated as if they had gone thru event_sched_in() - * and event_sched_out(). This is required to get consistent timings - * across the group. This also takes care of the case where the group - * could never be scheduled by ensuring tstamp_stopped is set to mark - * the time the event was actually stopped, such that time delta - * calculation in update_event_times() is correct. + * The events up to the failed event are scheduled out normally. */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) - simulate = true; + break; - if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; - } else { - event_sched_out(event, cpuctx, ctx); - } + event_sched_out(event, cpuctx, ctx); } event_sched_out(group_event, cpuctx, ctx); @@ -2227,46 +2182,11 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } -/* - * Complement to update_event_times(). This computes the tstamp_* values to - * continue 'enabled' state from @now, and effectively discards the time - * between the prior tstamp_stopped and now (as we were in the OFF state, or - * just switched (context) time base). - * - * This further assumes '@event->state == INACTIVE' (we just came from OFF) and - * cannot have been scheduled in yet. And going into INACTIVE state means - * '@event->tstamp_stopped = @now'. - * - * Thus given the rules of update_event_times(): - * - * total_time_enabled = tstamp_stopped - tstamp_enabled - * total_time_running = tstamp_stopped - tstamp_running - * - * We can insert 'tstamp_stopped == now' and reverse them to compute new - * tstamp_* values. - */ -static void __perf_event_enable_time(struct perf_event *event, u64 now) -{ - WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); - - event->tstamp_stopped = now; - event->tstamp_enabled = now - event->total_time_enabled; - event->tstamp_running = now - event->total_time_running; -} - static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - list_add_event(event, ctx); perf_group_attach(event); - /* - * We can be called with event->state == STATE_OFF when we create with - * .disabled = 1. In that case the IOC_ENABLE will call this function. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) - __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2498,28 +2418,6 @@ again: } /* - * Put a event into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_event_mark_enabled(struct perf_event *event) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - event->state = PERF_EVENT_STATE_INACTIVE; - __perf_event_enable_time(event, tstamp); - list_for_each_entry(sub, &event->sibling_list, group_entry) { - /* XXX should not be > INACTIVE if event isn't */ - if (sub->state >= PERF_EVENT_STATE_INACTIVE) - __perf_event_enable_time(sub, tstamp); - } -} - -/* * Cross CPU call to enable a performance event */ static void __perf_event_enable(struct perf_event *event, @@ -2537,14 +2435,12 @@ static void __perf_event_enable(struct perf_event *event, if (ctx->is_active) ctx_sched_out(ctx, cpuctx, EVENT_TIME); - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); if (!ctx->is_active) return; if (!event_filter_match(event)) { - if (is_cgroup_event(event)) - perf_cgroup_defer_enabled(event); ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } @@ -2864,18 +2760,10 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - switch (event->state) { - case PERF_EVENT_STATE_ACTIVE: + if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); - /* fall-through */ - - case PERF_EVENT_STATE_INACTIVE: - update_event_times(event); - break; - default: - break; - } + perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event @@ -3112,10 +3000,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -3123,10 +3007,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; - } + if (event->state == PERF_EVENT_STATE_INACTIVE) + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } @@ -3148,10 +3030,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -3523,7 +3401,7 @@ void perf_event_task_tick(void) struct perf_event_context *ctx, *tmp; int throttled; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); @@ -3543,7 +3421,7 @@ static int event_enable_on_exec(struct perf_event *event, if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } @@ -3637,12 +3515,15 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) { + if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } - update_event_times(event); + perf_event_update_time(event); + if (data->group) + perf_event_update_sibling_time(event); + if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; @@ -3657,7 +3538,6 @@ static void __perf_event_read(void *info) pmu->read(event); list_for_each_entry(sub, &event->sibling_list, group_entry) { - update_event_times(sub); if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since @@ -3686,7 +3566,8 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; @@ -3720,6 +3601,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise @@ -3729,6 +3611,16 @@ int perf_event_read_local(struct perf_event *event, u64 *value) event->pmu->read(event); *value = local64_read(&event->count); + if (enabled || running) { + u64 now = event->shadow_ctx_time + perf_clock(); + u64 __enabled, __running; + + __perf_update_times(event, now, &__enabled, &__running); + if (enabled) + *enabled = __enabled; + if (running) + *running = __running; + } out: local_irq_restore(flags); @@ -3737,23 +3629,35 @@ out: static int perf_event_read(struct perf_event *event, bool group) { + enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - struct perf_read_data data = { - .event = event, - .group = group, - .ret = 0, - }; +again: + if (state == PERF_EVENT_STATE_ACTIVE) { + struct perf_read_data data; + + /* + * Orders the ->state and ->oncpu loads such that if we see + * ACTIVE we must also see the right ->oncpu. + * + * Matches the smp_wmb() from event_sched_in(). + */ + smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; + data = (struct perf_read_data){ + .event = event, + .group = group, + .ret = 0, + }; + preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); @@ -3770,24 +3674,30 @@ static int perf_event_read(struct perf_event *event, bool group) (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; - } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + + } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); + state = event->state; + if (state != PERF_EVENT_STATE_INACTIVE) { + raw_spin_unlock_irqrestore(&ctx->lock, flags); + goto again; + } + /* - * may read while context is not active - * (e.g., thread is blocked), in that case - * we cannot update context time + * May read while context is not active (e.g., thread is + * blocked), in that case we cannot update context time */ - if (ctx->is_active) { + if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } + + perf_event_update_time(event); if (group) - update_group_times(event); - else - update_event_times(event); + perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -4233,7 +4143,7 @@ static void perf_remove_from_owner(struct perf_event *event) * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - owner = lockless_dereference(event->owner); + owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -4330,7 +4240,7 @@ again: * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ - ctx = lockless_dereference(child->ctx); + ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. @@ -4390,7 +4300,7 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; @@ -4418,6 +4328,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) return total; } + +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + count = __perf_event_read_value(event, enabled, running); + perf_event_ctx_unlock(event, ctx); + + return count; +} EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, @@ -4433,6 +4355,8 @@ static int __perf_read_group_add(struct perf_event *leader, if (ret) return ret; + raw_spin_lock_irqsave(&ctx->lock, flags); + /* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one @@ -4455,8 +4379,6 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - raw_spin_lock_irqsave(&ctx->lock, flags); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) @@ -4520,7 +4442,7 @@ static int perf_read_one(struct perf_event *event, u64 values[4]; int n = 0; - values[n++] = perf_event_read_value(event, &enabled, &running); + values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) @@ -4899,8 +4821,7 @@ static void calc_timer_values(struct perf_event *event, *now = perf_clock(); ctx_time = event->shadow_ctx_time + *now; - *enabled = ctx_time - event->tstamp_enabled; - *running = ctx_time - event->tstamp_running; + __perf_update_times(event, ctx_time, enabled, running); } static void perf_event_init_userpage(struct perf_event *event) @@ -5304,8 +5225,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!rb) goto aux_unlock; - aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); - aux_size = ACCESS_ONCE(rb->user_page->aux_size); + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) goto aux_unlock; @@ -8074,6 +7995,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; @@ -9404,6 +9326,11 @@ static void account_event(struct perf_event *event) inc = true; if (inc) { + /* + * We need the mutex here because static_branch_enable() + * must complete *before* the perf_sched_count increment + * becomes visible. + */ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; @@ -10529,7 +10456,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ + perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ raw_spin_unlock_irq(&child_ctx->lock); /* @@ -10767,7 +10694,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event *group_leader, struct perf_event_context *child_ctx) { - enum perf_event_active_state parent_state = parent_event->state; + enum perf_event_state parent_state = parent_event->state; struct perf_event *child_event; unsigned long flags; @@ -11103,6 +11030,7 @@ static void __perf_event_exit_context(void *__info) struct perf_event *event; raw_spin_lock(&ctx->lock); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index f684d8e5fa2b..f3e37971c842 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -381,7 +381,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * (B) <-> (C) ordering is still observed by the pmu driver. */ if (!rb->aux_overwrite) { - aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + aux_tail = READ_ONCE(rb->user_page->aux_tail); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); diff --git a/kernel/exit.c b/kernel/exit.c index f6cad39f35df..6b4298a41167 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1339,7 +1339,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ - int exit_state = ACCESS_ONCE(p->exit_state); + int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) diff --git a/kernel/extable.c b/kernel/extable.c index 9aa1cc41ecf7..a17fdb63dc3e 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -31,6 +31,8 @@ * mutex protecting text section modification (dynamic code patching). * some users need to sleep (allocating memory...) while they hold this lock. * + * Note: Also protects SMP-alternatives modification on x86. + * * NOT exported to modules - patching kernel text is a really delicate matter. */ DEFINE_MUTEX(text_mutex); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index a117adf7084b..89e355866450 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -97,6 +97,12 @@ config HANDLE_DOMAIN_IRQ config IRQ_TIMINGS bool +config GENERIC_IRQ_MATRIX_ALLOCATOR + bool + +config GENERIC_IRQ_RESERVATION_MODE + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index ed15d142694b..ff6e352e3a6c 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o obj-$(CONFIG_SMP) += affinity.o obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o +obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index befa671fba64..4e8089b319ae 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -54,7 +54,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE); + irq_activate_and_startup(desc, IRQ_NORESEND); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5a2ef92c2782..043bfc35b353 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -207,20 +207,24 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. Chained interrupt * installment or irq auto probing should not happen on - * managed irqs either. Emit a warning, break the affinity - * and start it up as a normal interrupt. + * managed irqs either. */ if (WARN_ON_ONCE(force)) - return IRQ_STARTUP_NORMAL; + return IRQ_STARTUP_ABORT; /* * The interrupt was requested, but there is no online CPU * in it's affinity mask. Put it into managed shutdown * state and let the cpu hotplug mechanism start it up once * a CPU in the mask becomes available. */ - irqd_set_managed_shutdown(d); return IRQ_STARTUP_ABORT; } + /* + * Managed interrupts have reserved resources, so this should not + * happen. + */ + if (WARN_ON(irq_domain_activate_irq(d, false))) + return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } #else @@ -236,7 +240,9 @@ static int __irq_startup(struct irq_desc *desc) struct irq_data *d = irq_desc_get_irq_data(desc); int ret = 0; - irq_domain_activate_irq(d); + /* Warn if this interrupt is not activated but try nevertheless */ + WARN_ON_ONCE(!irqd_is_activated(d)); + if (d->chip->irq_startup) { ret = d->chip->irq_startup(d); irq_state_clr_disabled(desc); @@ -269,6 +275,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) ret = __irq_startup(desc); break; case IRQ_STARTUP_ABORT: + irqd_set_managed_shutdown(d); return 0; } } @@ -278,6 +285,22 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) return ret; } +int irq_activate(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + + if (!irqd_affinity_is_managed(d)) + return irq_domain_activate_irq(d, false); + return 0; +} + +void irq_activate_and_startup(struct irq_desc *desc, bool resend) +{ + if (WARN_ON(irq_activate(desc))) + return; + irq_startup(desc, resend, IRQ_START_FORCE); +} + static void __irq_disable(struct irq_desc *desc, bool mask); void irq_shutdown(struct irq_desc *desc) @@ -953,7 +976,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; - irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); + irq_activate_and_startup(desc, IRQ_RESEND); } } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c3fdb36dec30..7f608ac39653 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -81,6 +81,8 @@ irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind) data->domain ? data->domain->name : ""); seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq); irq_debug_show_chip(m, data, ind + 1); + if (data->domain && data->domain->ops && data->domain->ops->debug_show) + data->domain->ops->debug_show(m, NULL, data, ind + 1); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (!data->parent_data) return; @@ -149,6 +151,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, ARRAY_SIZE(irqdesc_states)); @@ -226,6 +229,15 @@ static const struct file_operations dfs_irq_ops = { .release = single_release, }; +void irq_debugfs_copy_devname(int irq, struct device *dev) +{ + struct irq_desc *desc = irq_to_desc(irq); + const char *name = dev_name(dev); + + if (name) + desc->dev_name = kstrdup(name, GFP_KERNEL); +} + void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) { char name [10]; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 44ed5f8c8759..07d08ca701ec 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -75,6 +75,8 @@ extern void __enable_irq(struct irq_desc *desc); #define IRQ_START_FORCE true #define IRQ_START_COND false +extern int irq_activate(struct irq_desc *desc); +extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); @@ -437,6 +439,18 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ +#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) +static inline int irq_domain_activate_irq(struct irq_data *data, bool early) +{ + irqd_set_activated(data); + return 0; +} +static inline void irq_domain_deactivate_irq(struct irq_data *data) +{ + irqd_clr_activated(data); +} +#endif + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include <linux/debugfs.h> @@ -444,7 +458,9 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); static inline void irq_remove_debugfs_entry(struct irq_desc *desc) { debugfs_remove(desc->debugfs_file); + kfree(desc->dev_name); } +void irq_debugfs_copy_devname(int irq, struct device *dev); # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else @@ -459,4 +475,7 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) static inline void irq_remove_debugfs_entry(struct irq_desc *d) { } +static inline void irq_debugfs_copy_devname(int irq, struct device *dev) +{ +} #endif /* CONFIG_GENERIC_IRQ_DEBUGFS */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 82afb7ed369f..f2edcf85780d 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -27,7 +27,7 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) static int __init irq_affinity_setup(char *str) { - zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + alloc_bootmem_cpumask_var(&irq_default_affinity); cpulist_parse(str, irq_default_affinity); /* * Set at least the boot cpu. We don't want to end up with @@ -40,10 +40,8 @@ __setup("irqaffinity=", irq_affinity_setup); static void __init init_irq_default_affinity(void) { -#ifdef CONFIG_CPUMASK_OFFSTACK - if (!irq_default_affinity) + if (!cpumask_available(irq_default_affinity)) zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); -#endif if (cpumask_empty(irq_default_affinity)) cpumask_setall(irq_default_affinity); } @@ -448,7 +446,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, } } - flags = affinity ? IRQD_AFFINITY_MANAGED : 0; + flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0; mask = NULL; for (i = 0; i < cnt; i++) { @@ -462,6 +460,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, goto err; irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); + irq_add_debugfs_entry(start + i, desc); } bitmap_set(allocated_irqs, start, cnt); return start; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index ac4644e92b49..4f4f60015e8a 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -21,7 +21,6 @@ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); -static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; static void irq_domain_check_hierarchy(struct irq_domain *domain); @@ -211,6 +210,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); + mutex_init(&domain->revmap_tree_mutex); domain->ops = ops; domain->host_data = host_data; domain->hwirq_max = hwirq_max; @@ -462,9 +462,9 @@ static void irq_domain_clear_mapping(struct irq_domain *domain, if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = 0; } else { - mutex_lock(&revmap_trees_mutex); + mutex_lock(&domain->revmap_tree_mutex); radix_tree_delete(&domain->revmap_tree, hwirq); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&domain->revmap_tree_mutex); } } @@ -475,9 +475,9 @@ static void irq_domain_set_mapping(struct irq_domain *domain, if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = irq_data->irq; } else { - mutex_lock(&revmap_trees_mutex); + mutex_lock(&domain->revmap_tree_mutex); radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&domain->revmap_tree_mutex); } } @@ -921,8 +921,7 @@ static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) chip = irq_data_get_irq_chip(data); seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); - seq_printf(m, data ? "0x%p " : " %p ", - irq_data_get_irq_chip_data(data)); + seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data)); seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); @@ -1459,11 +1458,11 @@ static void irq_domain_fix_revmap(struct irq_data *d) return; /* Not using radix tree. */ /* Fix up the revmap. */ - mutex_lock(&revmap_trees_mutex); + mutex_lock(&d->domain->revmap_tree_mutex); slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq); if (slot) radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); - mutex_unlock(&revmap_trees_mutex); + mutex_unlock(&d->domain->revmap_tree_mutex); } /** @@ -1682,28 +1681,36 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); -static void __irq_domain_activate_irq(struct irq_data *irq_data) +static void __irq_domain_deactivate_irq(struct irq_data *irq_data) { if (irq_data && irq_data->domain) { struct irq_domain *domain = irq_data->domain; + if (domain->ops->deactivate) + domain->ops->deactivate(domain, irq_data); if (irq_data->parent_data) - __irq_domain_activate_irq(irq_data->parent_data); - if (domain->ops->activate) - domain->ops->activate(domain, irq_data); + __irq_domain_deactivate_irq(irq_data->parent_data); } } -static void __irq_domain_deactivate_irq(struct irq_data *irq_data) +static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) { - if (irq_data && irq_data->domain) { - struct irq_domain *domain = irq_data->domain; + int ret = 0; - if (domain->ops->deactivate) - domain->ops->deactivate(domain, irq_data); - if (irq_data->parent_data) - __irq_domain_deactivate_irq(irq_data->parent_data); + if (irqd && irqd->domain) { + struct irq_domain *domain = irqd->domain; + + if (irqd->parent_data) + ret = __irq_domain_activate_irq(irqd->parent_data, + early); + if (!ret && domain->ops->activate) { + ret = domain->ops->activate(domain, irqd, early); + /* Rollback in case of error */ + if (ret && irqd->parent_data) + __irq_domain_deactivate_irq(irqd->parent_data); + } } + return ret; } /** @@ -1714,12 +1721,15 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. */ -void irq_domain_activate_irq(struct irq_data *irq_data) +int irq_domain_activate_irq(struct irq_data *irq_data, bool early) { - if (!irqd_is_activated(irq_data)) { - __irq_domain_activate_irq(irq_data); + int ret = 0; + + if (!irqd_is_activated(irq_data)) + ret = __irq_domain_activate_irq(irq_data, early); + if (!ret) irqd_set_activated(irq_data); - } + return ret; } /** @@ -1810,6 +1820,8 @@ irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind) d->revmap_size + d->revmap_direct_max_irq); seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount); seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags); + if (d->ops && d->ops->debug_show) + d->ops->debug_show(m, d, NULL, ind + 1); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (!d->parent) return; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4bff6a10ae8e..2ff1c0c82fc9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -398,7 +398,8 @@ int irq_select_affinity_usr(unsigned int irq) /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt * @irq: interrupt number to set affinity - * @vcpu_info: vCPU specific data + * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU + * specific data for percpu_devid interrupts * * This function uses the vCPU specific data to set the vCPU * affinity for an irq. The vCPU specific data is passed from @@ -536,7 +537,7 @@ void __enable_irq(struct irq_desc *desc) * time. If it was already started up, then irq_startup() * will invoke irq_enable() under the hood. */ - irq_startup(desc, IRQ_RESEND, IRQ_START_COND); + irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; } default: @@ -1305,7 +1306,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * thread_mask assigned. See the loop above which or's * all existing action->thread_mask bits. */ - new->thread_mask = 1 << ffz(thread_mask); + new->thread_mask = 1UL << ffz(thread_mask); } else if (new->handler == irq_default_primary_handler && !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { @@ -1342,6 +1343,21 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_unlock; } + /* + * Activate the interrupt. That activation must happen + * independently of IRQ_NOAUTOEN. request_irq() can fail + * and the callers are supposed to handle + * that. enable_irq() of an interrupt requested with + * IRQ_NOAUTOEN is not supposed to fail. The activation + * keeps it in shutdown mode, it merily associates + * resources if necessary and if that's not possible it + * fails. Interrupts which are in managed shutdown mode + * will simply ignore that activation request. + */ + ret = irq_activate(desc); + if (ret) + goto out_unlock; + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_ONESHOT | IRQS_WAITING); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); @@ -1417,7 +1433,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) wake_up_process(new->secondary->thread); register_irq_proc(irq, desc); - irq_add_debugfs_entry(irq, desc); new->dir = NULL; register_handler_proc(irq, new); return 0; diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c new file mode 100644 index 000000000000..a3cbbc8191c5 --- /dev/null +++ b/kernel/irq/matrix.c @@ -0,0 +1,443 @@ +/* + * Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de> + * + * SPDX-License-Identifier: GPL-2.0 + */ +#include <linux/spinlock.h> +#include <linux/seq_file.h> +#include <linux/bitmap.h> +#include <linux/percpu.h> +#include <linux/cpu.h> +#include <linux/irq.h> + +#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long)) + +struct cpumap { + unsigned int available; + unsigned int allocated; + unsigned int managed; + bool online; + unsigned long alloc_map[IRQ_MATRIX_SIZE]; + unsigned long managed_map[IRQ_MATRIX_SIZE]; +}; + +struct irq_matrix { + unsigned int matrix_bits; + unsigned int alloc_start; + unsigned int alloc_end; + unsigned int alloc_size; + unsigned int global_available; + unsigned int global_reserved; + unsigned int systembits_inalloc; + unsigned int total_allocated; + unsigned int online_maps; + struct cpumap __percpu *maps; + unsigned long scratch_map[IRQ_MATRIX_SIZE]; + unsigned long system_map[IRQ_MATRIX_SIZE]; +}; + +#define CREATE_TRACE_POINTS +#include <trace/events/irq_matrix.h> + +/** + * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it + * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS + * @alloc_start: From which bit the allocation search starts + * @alloc_end: At which bit the allocation search ends, i.e first + * invalid bit + */ +__init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, + unsigned int alloc_start, + unsigned int alloc_end) +{ + struct irq_matrix *m; + + if (matrix_bits > IRQ_MATRIX_BITS) + return NULL; + + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) + return NULL; + + m->matrix_bits = matrix_bits; + m->alloc_start = alloc_start; + m->alloc_end = alloc_end; + m->alloc_size = alloc_end - alloc_start; + m->maps = alloc_percpu(*m->maps); + if (!m->maps) { + kfree(m); + return NULL; + } + return m; +} + +/** + * irq_matrix_online - Bring the local CPU matrix online + * @m: Matrix pointer + */ +void irq_matrix_online(struct irq_matrix *m) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + BUG_ON(cm->online); + + bitmap_zero(cm->alloc_map, m->matrix_bits); + cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc); + cm->allocated = 0; + m->global_available += cm->available; + cm->online = true; + m->online_maps++; + trace_irq_matrix_online(m); +} + +/** + * irq_matrix_offline - Bring the local CPU matrix offline + * @m: Matrix pointer + */ +void irq_matrix_offline(struct irq_matrix *m) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + /* Update the global available size */ + m->global_available -= cm->available; + cm->online = false; + m->online_maps--; + trace_irq_matrix_offline(m); +} + +static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm, + unsigned int num, bool managed) +{ + unsigned int area, start = m->alloc_start; + unsigned int end = m->alloc_end; + + bitmap_or(m->scratch_map, cm->managed_map, m->system_map, end); + bitmap_or(m->scratch_map, m->scratch_map, cm->alloc_map, end); + area = bitmap_find_next_zero_area(m->scratch_map, end, start, num, 0); + if (area >= end) + return area; + if (managed) + bitmap_set(cm->managed_map, area, num); + else + bitmap_set(cm->alloc_map, area, num); + return area; +} + +/** + * irq_matrix_assign_system - Assign system wide entry in the matrix + * @m: Matrix pointer + * @bit: Which bit to reserve + * @replace: Replace an already allocated vector with a system + * vector at the same bit position. + * + * The BUG_ON()s below are on purpose. If this goes wrong in the + * early boot process, then the chance to survive is about zero. + * If this happens when the system is life, it's not much better. + */ +void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, + bool replace) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + BUG_ON(bit > m->matrix_bits); + BUG_ON(m->online_maps > 1 || (m->online_maps && !replace)); + + set_bit(bit, m->system_map); + if (replace) { + BUG_ON(!test_and_clear_bit(bit, cm->alloc_map)); + cm->allocated--; + m->total_allocated--; + } + if (bit >= m->alloc_start && bit < m->alloc_end) + m->systembits_inalloc++; + + trace_irq_matrix_assign_system(bit, m); +} + +/** + * irq_matrix_reserve_managed - Reserve a managed interrupt in a CPU map + * @m: Matrix pointer + * @msk: On which CPUs the bits should be reserved. + * + * Can be called for offline CPUs. Note, this will only reserve one bit + * on all CPUs in @msk, but it's not guaranteed that the bits are at the + * same offset on all CPUs + */ +int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk) +{ + unsigned int cpu, failed_cpu; + + for_each_cpu(cpu, msk) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit; + + bit = matrix_alloc_area(m, cm, 1, true); + if (bit >= m->alloc_end) + goto cleanup; + cm->managed++; + if (cm->online) { + cm->available--; + m->global_available--; + } + trace_irq_matrix_reserve_managed(bit, cpu, m, cm); + } + return 0; +cleanup: + failed_cpu = cpu; + for_each_cpu(cpu, msk) { + if (cpu == failed_cpu) + break; + irq_matrix_remove_managed(m, cpumask_of(cpu)); + } + return -ENOSPC; +} + +/** + * irq_matrix_remove_managed - Remove managed interrupts in a CPU map + * @m: Matrix pointer + * @msk: On which CPUs the bits should be removed + * + * Can be called for offline CPUs + * + * This removes not allocated managed interrupts from the map. It does + * not matter which one because the managed interrupts free their + * allocation when they shut down. If not, the accounting is screwed, + * but all what can be done at this point is warn about it. + */ +void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) +{ + unsigned int cpu; + + for_each_cpu(cpu, msk) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit, end = m->alloc_end; + + if (WARN_ON_ONCE(!cm->managed)) + continue; + + /* Get managed bit which are not allocated */ + bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end); + + bit = find_first_bit(m->scratch_map, end); + if (WARN_ON_ONCE(bit >= end)) + continue; + + clear_bit(bit, cm->managed_map); + + cm->managed--; + if (cm->online) { + cm->available++; + m->global_available++; + } + trace_irq_matrix_remove_managed(bit, cpu, m, cm); + } +} + +/** + * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map + * @m: Matrix pointer + * @cpu: On which CPU the interrupt should be allocated + */ +int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu) +{ + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit, end = m->alloc_end; + + /* Get managed bit which are not allocated */ + bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end); + bit = find_first_bit(m->scratch_map, end); + if (bit >= end) + return -ENOSPC; + set_bit(bit, cm->alloc_map); + cm->allocated++; + m->total_allocated++; + trace_irq_matrix_alloc_managed(bit, cpu, m, cm); + return bit; +} + +/** + * irq_matrix_assign - Assign a preallocated interrupt in the local CPU map + * @m: Matrix pointer + * @bit: Which bit to mark + * + * This should only be used to mark preallocated vectors + */ +void irq_matrix_assign(struct irq_matrix *m, unsigned int bit) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) + return; + if (WARN_ON_ONCE(test_and_set_bit(bit, cm->alloc_map))) + return; + cm->allocated++; + m->total_allocated++; + cm->available--; + m->global_available--; + trace_irq_matrix_assign(bit, smp_processor_id(), m, cm); +} + +/** + * irq_matrix_reserve - Reserve interrupts + * @m: Matrix pointer + * + * This is merily a book keeping call. It increments the number of globally + * reserved interrupt bits w/o actually allocating them. This allows to + * setup interrupt descriptors w/o assigning low level resources to it. + * The actual allocation happens when the interrupt gets activated. + */ +void irq_matrix_reserve(struct irq_matrix *m) +{ + if (m->global_reserved <= m->global_available && + m->global_reserved + 1 > m->global_available) + pr_warn("Interrupt reservation exceeds available resources\n"); + + m->global_reserved++; + trace_irq_matrix_reserve(m); +} + +/** + * irq_matrix_remove_reserved - Remove interrupt reservation + * @m: Matrix pointer + * + * This is merily a book keeping call. It decrements the number of globally + * reserved interrupt bits. This is used to undo irq_matrix_reserve() when the + * interrupt was never in use and a real vector allocated, which undid the + * reservation. + */ +void irq_matrix_remove_reserved(struct irq_matrix *m) +{ + m->global_reserved--; + trace_irq_matrix_remove_reserved(m); +} + +/** + * irq_matrix_alloc - Allocate a regular interrupt in a CPU map + * @m: Matrix pointer + * @msk: Which CPUs to search in + * @reserved: Allocate previously reserved interrupts + * @mapped_cpu: Pointer to store the CPU for which the irq was allocated + */ +int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, + bool reserved, unsigned int *mapped_cpu) +{ + unsigned int cpu; + + for_each_cpu(cpu, msk) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + unsigned int bit; + + if (!cm->online) + continue; + + bit = matrix_alloc_area(m, cm, 1, false); + if (bit < m->alloc_end) { + cm->allocated++; + cm->available--; + m->total_allocated++; + m->global_available--; + if (reserved) + m->global_reserved--; + *mapped_cpu = cpu; + trace_irq_matrix_alloc(bit, cpu, m, cm); + return bit; + } + } + return -ENOSPC; +} + +/** + * irq_matrix_free - Free allocated interrupt in the matrix + * @m: Matrix pointer + * @cpu: Which CPU map needs be updated + * @bit: The bit to remove + * @managed: If true, the interrupt is managed and not accounted + * as available. + */ +void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, + unsigned int bit, bool managed) +{ + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + + if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) + return; + + if (cm->online) { + clear_bit(bit, cm->alloc_map); + cm->allocated--; + m->total_allocated--; + if (!managed) { + cm->available++; + m->global_available++; + } + } + trace_irq_matrix_free(bit, cpu, m, cm); +} + +/** + * irq_matrix_available - Get the number of globally available irqs + * @m: Pointer to the matrix to query + * @cpudown: If true, the local CPU is about to go down, adjust + * the number of available irqs accordingly + */ +unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + return m->global_available - cpudown ? cm->available : 0; +} + +/** + * irq_matrix_reserved - Get the number of globally reserved irqs + * @m: Pointer to the matrix to query + */ +unsigned int irq_matrix_reserved(struct irq_matrix *m) +{ + return m->global_reserved; +} + +/** + * irq_matrix_allocated - Get the number of allocated irqs on the local cpu + * @m: Pointer to the matrix to search + * + * This returns number of allocated irqs + */ +unsigned int irq_matrix_allocated(struct irq_matrix *m) +{ + struct cpumap *cm = this_cpu_ptr(m->maps); + + return cm->allocated; +} + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +/** + * irq_matrix_debug_show - Show detailed allocation information + * @sf: Pointer to the seq_file to print to + * @m: Pointer to the matrix allocator + * @ind: Indentation for the print format + * + * Note, this is a lockless snapshot. + */ +void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind) +{ + unsigned int nsys = bitmap_weight(m->system_map, m->matrix_bits); + int cpu; + + seq_printf(sf, "Online bitmaps: %6u\n", m->online_maps); + seq_printf(sf, "Global available: %6u\n", m->global_available); + seq_printf(sf, "Global reserved: %6u\n", m->global_reserved); + seq_printf(sf, "Total allocated: %6u\n", m->total_allocated); + seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits, + m->system_map); + seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " "); + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct cpumap *cm = per_cpu_ptr(m->maps, cpu); + + seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ", + cpu, cm->available, cm->managed, cm->allocated, + m->matrix_bits, cm->alloc_map); + } + cpus_read_unlock(); +} +#endif diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3fa4bd59f569..edb987b2c58d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -16,6 +16,8 @@ #include <linux/msi.h> #include <linux/slab.h> +#include "internals.h" + /** * alloc_msi_entry - Allocate an initialize msi_entry * @dev: Pointer to the device for which this is allocated @@ -100,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data, return ret; } -static void msi_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int msi_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool early) { struct msi_msg msg; BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); irq_chip_write_msi_msg(irq_data, &msg); + return 0; } static void msi_domain_deactivate(struct irq_domain *domain, @@ -373,8 +376,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return ret; } - for (i = 0; i < desc->nvec_used; i++) + for (i = 0; i < desc->nvec_used; i++) { irq_set_msi_desc_off(virq, i, desc); + irq_debugfs_copy_devname(virq + i, dev); + } } if (ops->msi_finish) @@ -396,11 +401,28 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct irq_data *irq_data; irq_data = irq_domain_get_irq_data(domain, desc->irq); - irq_domain_activate_irq(irq_data); + ret = irq_domain_activate_irq(irq_data, true); + if (ret) + goto cleanup; + if (info->flags & MSI_FLAG_MUST_REACTIVATE) + irqd_clr_activated(irq_data); } } - return 0; + +cleanup: + for_each_msi_entry(desc, dev) { + struct irq_data *irqd; + + if (desc->irq == virq) + break; + + irqd = irq_domain_get_irq_data(domain, desc->irq); + if (irqd_is_activated(irqd)) + irq_domain_deactivate_irq(irqd); + } + msi_domain_free_irqs(domain, dev); + return ret; } /** diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index c010cc0daf79..e8f374971e37 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -155,8 +155,9 @@ static ssize_t write_irq_affinity(int type, struct file *file, */ err = irq_select_affinity_usr(irq) ? -EINVAL : count; } else { - irq_set_affinity(irq, new_value); - err = count; + err = irq_set_affinity(irq, new_value); + if (!err) + err = count; } free_cpumask: diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 987d7bca4864..1215229d1c12 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -21,7 +21,7 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(unsigned long dummy); -static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs); static int irq_poll_cpu; static atomic_t irq_poll_active; diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index c8c1d073fbf1..e0923fa4927a 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -264,7 +264,7 @@ u64 irq_timings_next_event(u64 now) * order to prevent the timings circular buffer to be updated * while we are reading it. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); /* * Number of elements in the circular buffer: If it happens it diff --git a/kernel/irq_work.c b/kernel/irq_work.c index bcf107ce0854..40e9d739c169 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -56,7 +56,6 @@ void __weak arch_irq_work_raise(void) */ } -#ifdef CONFIG_SMP /* * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. @@ -68,6 +67,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); +#ifdef CONFIG_SMP + /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); @@ -78,10 +79,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) arch_send_call_function_single_ipi(cpu); +#else /* #ifdef CONFIG_SMP */ + irq_work_queue(work); +#endif /* #else #ifdef CONFIG_SMP */ + return true; } -EXPORT_SYMBOL_GPL(irq_work_queue_on); -#endif /* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) @@ -128,9 +131,9 @@ bool irq_work_needs_cpu(void) static void irq_work_run_list(struct llist_head *list) { - unsigned long flags; - struct irq_work *work; + struct irq_work *work, *tmp; struct llist_node *llnode; + unsigned long flags; BUG_ON(!irqs_disabled()); @@ -138,11 +141,7 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - while (llnode != NULL) { - work = llist_entry(llnode, struct irq_work, llnode); - - llnode = llist_next(llnode); - + llist_for_each_entry_safe(work, tmp, llnode, llnode) { /* * Clear the PENDING bit, after this point the @work * can be re-used. @@ -188,7 +187,7 @@ void irq_work_tick(void) */ void irq_work_sync(struct irq_work *work) { - WARN_ON_ONCE(irqs_disabled()); + lockdep_assert_irqs_enabled(); while (work->flags & IRQ_WORK_BUSY) cpu_relax(); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0bf2e8f5244a..8ff4ca4665ff 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -83,7 +83,7 @@ static void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); /* * Careful if we get concurrent static_key_slow_inc() calls; @@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); void static_key_enable_cpuslocked(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(static_key_enable); void static_key_disable_cpuslocked(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); @@ -224,21 +224,21 @@ static void jump_label_update_timeout(struct work_struct *work) void static_key_slow_dec(struct static_key *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(key, 0, NULL); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_deferred(struct static_key_deferred *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(&key->key, key->timeout, &key->work); } EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); void static_key_deferred_flush(struct static_key_deferred *key) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); flush_delayed_work(&key->work); } EXPORT_SYMBOL_GPL(static_key_deferred_flush); @@ -246,7 +246,7 @@ EXPORT_SYMBOL_GPL(static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { - STATIC_KEY_CHECK_USE(); + STATIC_KEY_CHECK_USE(key); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 127e7cfafa55..1e6ae66c6244 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -480,6 +480,7 @@ struct kallsym_iter { char name[KSYM_NAME_LEN]; char module_name[MODULE_NAME_LEN]; int exported; + int show_value; }; static int get_ksymbol_mod(struct kallsym_iter *iter) @@ -582,12 +583,15 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { + unsigned long value; struct kallsym_iter *iter = m->private; /* Some debugging symbols have no name. Ignore them. */ if (!iter->name[0]) return 0; + value = iter->show_value ? iter->value : 0; + if (iter->module_name[0]) { char type; @@ -597,10 +601,10 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); - seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, + seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value, type, iter->name, iter->module_name); } else - seq_printf(m, "%pK %c %s\n", (void *)iter->value, + seq_printf(m, KALLSYM_FMT " %c %s\n", value, iter->type, iter->name); return 0; } @@ -612,6 +616,40 @@ static const struct seq_operations kallsyms_op = { .show = s_show }; +static inline int kallsyms_for_perf(void) +{ +#ifdef CONFIG_PERF_EVENTS + extern int sysctl_perf_event_paranoid; + if (sysctl_perf_event_paranoid <= 1) + return 1; +#endif + return 0; +} + +/* + * We show kallsyms information even to normal users if we've enabled + * kernel profiling and are explicitly not paranoid (so kptr_restrict + * is clear, and sysctl_perf_event_paranoid isn't set). + * + * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to + * block even that). + */ +int kallsyms_show_value(void) +{ + switch (kptr_restrict) { + case 0: + if (kallsyms_for_perf()) + return 1; + /* fallthrough */ + case 1: + if (has_capability_noaudit(current, CAP_SYSLOG)) + return 1; + /* fallthrough */ + default: + return 0; + } +} + static int kallsyms_open(struct inode *inode, struct file *file) { /* @@ -625,6 +663,7 @@ static int kallsyms_open(struct inode *inode, struct file *file) return -ENOMEM; reset_iter(iter, 0); + iter->show_value = kallsyms_show_value(); return 0; } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 9f48f4412297..e5bcd94c1efb 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -406,9 +406,10 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, return 1; } -static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +static int locate_mem_hole_callback(struct resource *res, void *arg) { struct kexec_buf *kbuf = (struct kexec_buf *)arg; + u64 start = res->start, end = res->end; unsigned long sz = end - start + 1; /* Returning 0 will take to next memory range */ @@ -437,7 +438,7 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg) * func returning non-zero, then zero will be returned. */ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(u64, u64, void *)) + int (*func)(struct resource *, void *)) { if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a1606a4224e1..da2ccf142358 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -117,7 +117,7 @@ enum kprobe_slot_state { SLOT_USED = 2, }; -static void *alloc_insn_page(void) +void __weak *alloc_insn_page(void) { return module_alloc(PAGE_SIZE); } @@ -573,13 +573,15 @@ static void kprobe_optimizer(struct work_struct *work) do_unoptimize_kprobes(); /* - * Step 2: Wait for quiesence period to ensure all running interrupts - * are done. Because optprobe may modify multiple instructions - * there is a chance that Nth instruction is interrupted. In that - * case, running interrupt can return to 2nd-Nth byte of jump - * instruction. This wait is for avoiding it. + * Step 2: Wait for quiesence period to ensure all potentially + * preempted tasks to have normally scheduled. Because optprobe + * may modify multiple instructions, there is a chance that Nth + * instruction is preempted. In that case, such tasks can return + * to 2nd-Nth byte of jump instruction. This wait is for avoiding it. + * Note that on non-preemptive kernel, this is transparently converted + * to synchronoze_sched() to wait for all interrupts to have completed. */ - synchronize_sched(); + synchronize_rcu_tasks(); /* Step 3: Optimize kprobes after quiesence period */ do_optimize_kprobes(); @@ -1769,6 +1771,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } +#if 0 int register_jprobes(struct jprobe **jps, int num) { int ret = 0, i; @@ -1837,6 +1840,7 @@ void unregister_jprobes(struct jprobe **jps, int num) } } EXPORT_SYMBOL_GPL(unregister_jprobes); +#endif #ifdef CONFIG_KRETPROBES /* diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c19edf82427..ba3992c8c375 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -798,15 +798,14 @@ EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. - * @__data: pointer to the data associated with the timer + * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ -void kthread_delayed_work_timer_fn(unsigned long __data) +void kthread_delayed_work_timer_fn(struct timer_list *t) { - struct kthread_delayed_work *dwork = - (struct kthread_delayed_work *)__data; + struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; @@ -837,8 +836,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn || - timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e36e652d996f..db933d063bfc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -76,6 +76,19 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK +static int crossrelease_fullstack = 1; +#else +static int crossrelease_fullstack; +#endif +static int __init allow_crossrelease_fullstack(char *str) +{ + crossrelease_fullstack = 1; + return 0; +} + +early_param("crossrelease_fullstack", allow_crossrelease_fullstack); + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -4863,8 +4876,14 @@ static void add_xhlock(struct held_lock *hlock) xhlock->trace.nr_entries = 0; xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; xhlock->trace.entries = xhlock->trace_entries; - xhlock->trace.skip = 3; - save_stack_trace(&xhlock->trace); + + if (crossrelease_fullstack) { + xhlock->trace.skip = 3; + save_stack_trace(&xhlock->trace); + } else { + xhlock->trace.nr_entries = 1; + xhlock->trace.entries[0] = hlock->acquire_ip; + } } static inline int same_context_xhlock(struct hist_lock *xhlock) diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 2655f26ec882..c7471c3fb798 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -23,49 +23,11 @@ #include <linux/spinlock.h> #include <asm/qrwlock.h> -/* - * This internal data structure is used for optimizing access to some of - * the subfields within the atomic_t cnts. - */ -struct __qrwlock { - union { - atomic_t cnts; - struct { -#ifdef __LITTLE_ENDIAN - u8 wmode; /* Writer mode */ - u8 rcnts[3]; /* Reader counts */ -#else - u8 rcnts[3]; /* Reader counts */ - u8 wmode; /* Writer mode */ -#endif - }; - }; - arch_spinlock_t lock; -}; - -/** - * rspin_until_writer_unlock - inc reader count & spin until writer is gone - * @lock : Pointer to queue rwlock structure - * @writer: Current queue rwlock writer status byte - * - * In interrupt context or at the head of the queue, the reader will just - * increment the reader count & wait until the writer releases the lock. - */ -static __always_inline void -rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) -{ - while ((cnts & _QW_WMASK) == _QW_LOCKED) { - cpu_relax(); - cnts = atomic_read_acquire(&lock->cnts); - } -} - /** * queued_read_lock_slowpath - acquire read lock of a queue rwlock * @lock: Pointer to queue rwlock structure - * @cnts: Current qrwlock lock value */ -void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) +void queued_read_lock_slowpath(struct qrwlock *lock) { /* * Readers come here when they cannot get the lock without waiting @@ -73,13 +35,11 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) if (unlikely(in_interrupt())) { /* * Readers in interrupt context will get the lock immediately - * if the writer is just waiting (not holding the lock yet). - * The rspin_until_writer_unlock() function returns immediately - * in this case. Otherwise, they will spin (with ACQUIRE - * semantics) until the lock is available without waiting in - * the queue. + * if the writer is just waiting (not holding the lock yet), + * so spin with ACQUIRE semantics until the lock is available + * without waiting in the queue. */ - rspin_until_writer_unlock(lock, cnts); + atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED)); return; } atomic_sub(_QR_BIAS, &lock->cnts); @@ -88,14 +48,14 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) * Put the reader into the wait queue */ arch_spin_lock(&lock->wait_lock); + atomic_add(_QR_BIAS, &lock->cnts); /* * The ACQUIRE semantics of the following spinning code ensure * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); - rspin_until_writer_unlock(lock, cnts); + atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED)); /* * Signal the next one in queue to become queue head @@ -110,8 +70,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath); */ void queued_write_lock_slowpath(struct qrwlock *lock) { - u32 cnts; - /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); @@ -120,30 +78,14 @@ void queued_write_lock_slowpath(struct qrwlock *lock) (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) goto unlock; - /* - * Set the waiting flag to notify readers that a writer is pending, - * or wait for a previous writer to go away. - */ - for (;;) { - struct __qrwlock *l = (struct __qrwlock *)lock; - - if (!READ_ONCE(l->wmode) && - (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) - break; + /* Set the waiting flag to notify readers that a writer is pending */ + atomic_add(_QW_WAITING, &lock->cnts); - cpu_relax(); - } - - /* When no more readers, set the locked flag */ - for (;;) { - cnts = atomic_read(&lock->cnts); - if ((cnts == _QW_WAITING) && - (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, - _QW_LOCKED) == _QW_WAITING)) - break; - - cpu_relax(); - } + /* When no more readers or writers, set the locked flag */ + do { + atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); + } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, + _QW_LOCKED) != _QW_WAITING); unlock: arch_spin_unlock(&lock->wait_lock); } diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 15b6a39366c6..6ee477765e6c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -61,21 +61,50 @@ struct pv_node { #include "qspinlock_stat.h" /* + * Hybrid PV queued/unfair lock + * * By replacing the regular queued_spin_trylock() with the function below, * it will be called once when a lock waiter enter the PV slowpath before - * being queued. By allowing one lock stealing attempt here when the pending - * bit is off, it helps to reduce the performance impact of lock waiter - * preemption without the drawback of lock starvation. + * being queued. + * + * The pending bit is set by the queue head vCPU of the MCS wait queue in + * pv_wait_head_or_lock() to signal that it is ready to spin on the lock. + * When that bit becomes visible to the incoming waiters, no lock stealing + * is allowed. The function will return immediately to make the waiters + * enter the MCS wait queue. So lock starvation shouldn't happen as long + * as the queued mode vCPUs are actively running to set the pending bit + * and hence disabling lock stealing. + * + * When the pending bit isn't set, the lock waiters will stay in the unfair + * mode spinning on the lock unless the MCS wait queue is empty. In this + * case, the lock waiters will enter the queued mode slowpath trying to + * become the queue head and set the pending bit. + * + * This hybrid PV queued/unfair lock combines the best attributes of a + * queued lock (no lock starvation) and an unfair lock (good performance + * on not heavily contended locks). */ -#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) -static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l) +static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; - if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { - qstat_inc(qstat_pv_lock_stealing, true); - return true; + /* + * Stay in unfair lock mode as long as queued mode waiters are + * present in the MCS wait queue but the pending bit isn't set. + */ + for (;;) { + int val = atomic_read(&lock->val); + + if (!(val & _Q_LOCKED_PENDING_MASK) && + (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + qstat_inc(qstat_pv_lock_stealing, true); + return true; + } + if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) + break; + + cpu_relax(); } return false; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index a6c76a4832b4..f549c552dbf1 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -29,6 +29,22 @@ void __sched down_read(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read); +int __sched down_read_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_reader_owned(sem); + return 0; +} + +EXPORT_SYMBOL(down_read_killable); + /* * trylock for reading -- returns 1 if successful, 0 if contention */ diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 6e40fdfba326..1fd1a7543cdd 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -30,11 +30,10 @@ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) /* * The __lock_function inlines are taken from - * include/linux/spinlock_api_smp.h + * spinlock : include/linux/spinlock_api_smp.h + * rwlock : include/linux/rwlock_api_smp.h */ #else -#define raw_read_can_lock(l) read_can_lock(l) -#define raw_write_can_lock(l) write_can_lock(l) /* * Some architectures can relax in favour of the CPU owning the lock. @@ -69,7 +68,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + while ((lock)->break_lock) \ arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ @@ -89,7 +88,7 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + while ((lock)->break_lock) \ arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..32c2cdaccd93 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -278,6 +278,16 @@ static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); module_param(sig_enforce, bool_enable_only, 0644); #endif /* !CONFIG_MODULE_SIG_FORCE */ +/* + * Export sig_enforce kernel cmdline parameter to allow other subsystems rely + * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. + */ +bool is_module_sig_enforced(void) +{ + return sig_enforce; +} +EXPORT_SYMBOL(is_module_sig_enforced); + /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); @@ -1516,7 +1526,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) sattr->mattr.show = module_sect_show; sattr->mattr.store = NULL; sattr->mattr.attr.name = sattr->name; - sattr->mattr.attr.mode = S_IRUGO; + sattr->mattr.attr.mode = S_IRUSR; *(gattr++) = &(sattr++)->mattr.attr; } *gattr = NULL; @@ -4147,6 +4157,7 @@ static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); char buf[MODULE_FLAGS_BUF_SIZE]; + unsigned long value; /* We always ignore unformed modules. */ if (mod->state == MODULE_STATE_UNFORMED) @@ -4162,7 +4173,8 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%pK", mod->core_layout.base); + value = m->private ? 0 : (unsigned long)mod->core_layout.base; + seq_printf(m, " 0x" KALLSYM_FMT, value); /* Taints info */ if (mod->taints) @@ -4184,9 +4196,23 @@ static const struct seq_operations modules_op = { .show = m_show }; +/* + * This also sets the "private" pointer to non-NULL if the + * kernel pointers should be hidden (so you can just test + * "m->private" to see if you should keep the values private). + * + * We use the same logic as for /proc/kallsyms. + */ static int modules_open(struct inode *inode, struct file *file) { - return seq_open(file, &modules_op); + int err = seq_open(file, &modules_op); + + if (!err) { + struct seq_file *m = file->private_data; + m->private = kallsyms_show_value() ? NULL : (void *)8ul; + } + + return 0; } static const struct file_operations proc_modules_operations = { diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index e4b43fef89f5..59c471de342a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -203,6 +203,21 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) extern int rcu_cpu_stall_suppress; int rcu_jiffies_till_stall_check(void); +#define rcu_ftrace_dump_stall_suppress() \ +do { \ + if (!rcu_cpu_stall_suppress) \ + rcu_cpu_stall_suppress = 3; \ +} while (0) + +#define rcu_ftrace_dump_stall_unsuppress() \ +do { \ + if (rcu_cpu_stall_suppress == 3) \ + rcu_cpu_stall_suppress = 0; \ +} while (0) + +#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */ +#define rcu_ftrace_dump_stall_suppress() +#define rcu_ftrace_dump_stall_unsuppress() #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ /* @@ -220,8 +235,12 @@ do { \ static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ \ if (!atomic_read(&___rfd_beenhere) && \ - !atomic_xchg(&___rfd_beenhere, 1)) \ + !atomic_xchg(&___rfd_beenhere, 1)) { \ + tracing_off(); \ + rcu_ftrace_dump_stall_suppress(); \ ftrace_dump(oops_dump_mode); \ + rcu_ftrace_dump_stall_unsuppress(); \ + } \ } while (0) void rcu_early_boot_tests(void); diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7649fcd2c4c7..88cba7c2956c 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -23,6 +23,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/interrupt.h> +#include <linux/rcupdate.h> #include "rcu_segcblist.h" diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 45f2ffbc1e78..74f6b0146b98 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -51,6 +51,7 @@ #include <asm/byteorder.h> #include <linux/torture.h> #include <linux/vmalloc.h> +#include <linux/sched/debug.h> #include "rcu.h" @@ -89,6 +90,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); +torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -1076,7 +1078,7 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) * counter in the element should never be greater than 1, otherwise, the * RCU implementation is broken. */ -static void rcu_torture_timer(unsigned long unused) +static void rcu_torture_timer(struct timer_list *unused) { int idx; unsigned long started; @@ -1163,7 +1165,7 @@ rcu_torture_reader(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) - setup_timer_on_stack(&t, rcu_torture_timer, 0); + timer_setup_on_stack(&t, rcu_torture_timer, 0); do { if (irqreader && cur_ops->irq_capable) { @@ -1239,6 +1241,7 @@ rcu_torture_stats_print(void) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; static unsigned long rtcv_snap = ULONG_MAX; + static bool splatted; struct task_struct *wtp; for_each_possible_cpu(cpu) { @@ -1324,6 +1327,10 @@ rcu_torture_stats_print(void) gpnum, completed, flags, wtp == NULL ? ~0UL : wtp->state, wtp == NULL ? -1 : (int)task_cpu(wtp)); + if (!splatted && wtp) { + sched_show_task(wtp); + splatted = true; + } show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } @@ -1357,7 +1364,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " - "stall_cpu=%d stall_cpu_holdoff=%d " + "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -1365,7 +1372,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, - stall_cpu, stall_cpu_holdoff, + stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, n_barrier_cbs, onoff_interval, onoff_holdoff); } @@ -1430,12 +1437,19 @@ static int rcu_torture_stall(void *args) if (!kthread_should_stop()) { stop_at = get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - pr_alert("rcu_torture_stall start.\n"); rcu_read_lock(); - preempt_disable(); + if (stall_cpu_irqsoff) + local_irq_disable(); + else + preempt_disable(); + pr_alert("rcu_torture_stall start on CPU %d.\n", + smp_processor_id()); while (ULONG_CMP_LT(get_seconds(), stop_at)) continue; /* Induce RCU CPU stall warning. */ - preempt_enable(); + if (stall_cpu_irqsoff) + local_irq_enable(); + else + preempt_enable(); rcu_read_unlock(); pr_alert("rcu_torture_stall end.\n"); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3e3650e94ae6..f9c0ca2ccf0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -534,8 +534,8 @@ module_param(rcu_kick_kthreads, bool, 0644); * How long the grace period must be before we start recruiting * quiescent-state help from rcu_note_context_switch(). */ -static ulong jiffies_till_sched_qs = HZ / 20; -module_param(jiffies_till_sched_qs, ulong, 0644); +static ulong jiffies_till_sched_qs = HZ / 10; +module_param(jiffies_till_sched_qs, ulong, 0444); static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); @@ -734,7 +734,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); return READ_ONCE(*fp); } @@ -746,7 +746,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) @@ -773,7 +773,7 @@ static void rcu_eqs_enter_common(bool user) struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { @@ -837,10 +837,13 @@ static void rcu_eqs_enter(bool user) * We crowbar the ->dynticks_nesting field to zero to allow for * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. + * + * If you add or remove a call to rcu_idle_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_idle_enter(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rcu_eqs_enter(false); } @@ -852,10 +855,13 @@ void rcu_idle_enter(void) * is permitted between this call and rcu_user_exit(). This way the * CPU doesn't need to maintain the tick for RCU maintenance purposes * when the CPU runs in userspace. + * + * If you add or remove a call to rcu_user_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_user_enter(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ @@ -875,12 +881,15 @@ void rcu_user_enter(void) * Use things like work queues to work around this limitation. * * You have been warned. + * + * If you add or remove a call to rcu_irq_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_exit(void) { struct rcu_dynticks *rdtp; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ @@ -899,6 +908,9 @@ void rcu_irq_exit(void) /* * Wrapper for rcu_irq_exit() where interrupts are enabled. + * + * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_exit_irqson(void) { @@ -947,7 +959,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); @@ -971,6 +983,9 @@ static void rcu_eqs_exit(bool user) * allow for the possibility of usermode upcalls messing up our count * of interrupt nesting level during the busy period that is just * now starting. + * + * If you add or remove a call to rcu_idle_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_idle_exit(void) { @@ -987,6 +1002,9 @@ void rcu_idle_exit(void) * * Exit RCU idle mode while entering the kernel because it can * run a RCU read side critical section anytime. + * + * If you add or remove a call to rcu_user_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_user_exit(void) { @@ -1012,13 +1030,16 @@ void rcu_user_exit(void) * Use things like work queues to work around this limitation. * * You have been warned. + * + * If you add or remove a call to rcu_irq_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_enter(void) { struct rcu_dynticks *rdtp; long long oldval; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ @@ -1037,6 +1058,9 @@ void rcu_irq_enter(void) /* * Wrapper for rcu_irq_enter() where interrupts are enabled. + * + * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_irq_enter_irqson(void) { @@ -1055,6 +1079,9 @@ void rcu_irq_enter_irqson(void) * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) + * + * If you add or remove a call to rcu_nmi_enter(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_nmi_enter(void) { @@ -1087,6 +1114,9 @@ void rcu_nmi_enter(void) * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. + * + * If you add or remove a call to rcu_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. */ void rcu_nmi_exit(void) { @@ -1207,6 +1237,22 @@ static int rcu_is_cpu_rrupt_from_idle(void) } /* + * We are reporting a quiescent state on behalf of some other CPU, so + * it is our responsibility to check for and handle potential overflow + * of the rcu_node ->gpnum counter with respect to the rcu_data counters. + * After all, the CPU might be in deep idle state, and thus executing no + * code whatsoever. + */ +static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) +{ + lockdep_assert_held(&rnp->lock); + if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) + WRITE_ONCE(rdp->gpwrap, true); + if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) + rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; +} + +/* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. @@ -1216,15 +1262,34 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, - rdp->mynode->gpnum)) - WRITE_ONCE(rdp->gpwrap, true); + rcu_gpnum_ovf(rdp->mynode, rdp); return 1; } return 0; } /* + * Handler for the irq_work request posted when a grace period has + * gone on for too long, but not yet long enough for an RCU CPU + * stall warning. Set state appropriately, but just complain if + * there is unexpected state on entry. + */ +static void rcu_iw_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = container_of(iwp, struct rcu_data, rcu_iw); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); + if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { + rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_pending = false; + } + raw_spin_unlock_rcu_node(rnp); +} + +/* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() @@ -1235,8 +1300,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) unsigned long jtsq; bool *rnhqp; bool *ruqp; - unsigned long rjtsc; - struct rcu_node *rnp; + struct rcu_node *rnp = rdp->mynode; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1249,34 +1313,25 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; + rcu_gpnum_ovf(rnp, rdp); return 1; } - /* Compute and saturate jiffies_till_sched_qs. */ - jtsq = jiffies_till_sched_qs; - rjtsc = rcu_jiffies_till_stall_check(); - if (jtsq > rjtsc / 2) { - WRITE_ONCE(jiffies_till_sched_qs, rjtsc); - jtsq = rjtsc / 2; - } else if (jtsq < 1) { - WRITE_ONCE(jiffies_till_sched_qs, 1); - jtsq = 1; - } - /* * Has this CPU encountered a cond_resched_rcu_qs() since the * beginning of the grace period? For this to be the case, * the CPU has to have noticed the current grace period. This * might not be the case for nohz_full CPUs looping in the kernel. */ - rnp = rdp->mynode; + jtsq = jiffies_till_sched_qs; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + rcu_gpnum_ovf(rnp, rdp); return 1; - } else { + } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ smp_store_release(ruqp, true); } @@ -1285,6 +1340,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; + rcu_gpnum_ovf(rnp, rdp); return 1; } @@ -1304,10 +1360,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * updates are only once every few jiffies, the probability of * lossage (and thus of slight grace-period extension) is * quite low. - * - * Note that if the jiffies_till_sched_qs boot/sysfs parameter - * is set too high, we override with half of the RCU CPU stall - * warning delay. */ rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && @@ -1316,15 +1368,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ + rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */ } /* - * If more than halfway to RCU CPU stall-warning time, do - * a resched_cpu() to try to loosen things up a bit. + * If more than halfway to RCU CPU stall-warning time, do a + * resched_cpu() to try to loosen things up a bit. Also check to + * see if the CPU is getting hammered with interrupts, but only + * once per grace period, just to keep the IPIs down to a dull roar. */ - if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) + if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { resched_cpu(rdp->cpu); + if (IS_ENABLED(CONFIG_IRQ_WORK) && + !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum && + (rnp->ffmask & rdp->grpmask)) { + init_irq_work(&rdp->rcu_iw, rcu_iw_handler); + rdp->rcu_iw_pending = true; + rdp->rcu_iw_gpnum = rnp->gpnum; + irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); + } + } return 0; } @@ -1513,6 +1576,7 @@ static void print_cpu_stall(struct rcu_state *rsp) { int cpu; unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1528,7 +1592,9 @@ static void print_cpu_stall(struct rcu_state *rsp) */ pr_err("INFO: %s self-detected stall on CPU", rsp->name); print_cpu_stall_info_begin(); + raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); print_cpu_stall_info(rsp, smp_processor_id()); + raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, @@ -1922,6 +1988,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); + rcu_gpnum_ovf(rnp, rdp); } return ret; } @@ -3702,6 +3769,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; + rdp->rcu_iw_pending = false; + rdp->rcu_iw_gpnum = rnp->gpnum - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -3739,10 +3808,24 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) */ int rcutree_online_cpu(unsigned int cpu) { - sync_sched_exp_online_cleanup(cpu); - rcutree_affinity_setting(cpu, -1); + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask |= rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } if (IS_ENABLED(CONFIG_TREE_SRCU)) srcu_online_cpu(cpu); + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return 0; /* Too early in boot for scheduler work. */ + sync_sched_exp_online_cleanup(cpu); + rcutree_affinity_setting(cpu, -1); return 0; } @@ -3752,6 +3835,19 @@ int rcutree_online_cpu(unsigned int cpu) */ int rcutree_offline_cpu(unsigned int cpu) { + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + rcutree_affinity_setting(cpu, cpu); if (IS_ENABLED(CONFIG_TREE_SRCU)) srcu_offline_cpu(cpu); @@ -4200,8 +4296,7 @@ void __init rcu_init(void) for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); rcu_cpu_starting(cpu); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_online_cpu(cpu); + rcutree_online_cpu(cpu); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e1f285f0a70..46a5d1991450 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -103,6 +103,7 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. */ + unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -285,6 +286,10 @@ struct rcu_data { /* 8) RCU CPU stall data. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ + /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ + struct irq_work rcu_iw; /* Check for non-irq activity. */ + bool rcu_iw_pending; /* Is ->rcu_iw pending? */ + unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */ int cpu; struct rcu_state *rsp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..db85ca3975f1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -29,6 +29,7 @@ #include <linux/oom.h> #include <linux/sched/debug.h> #include <linux/smpboot.h> +#include <linux/sched/isolation.h> #include <uapi/linux/sched/types.h> #include "../time/tick-internal.h" @@ -54,6 +55,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); * This probably needs to be excluded from -rt builds. */ #define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) +#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1) #endif /* #else #ifdef CONFIG_RCU_BOOST */ @@ -325,7 +327,7 @@ static void rcu_preempt_note_context_switch(bool preempt) struct rcu_data *rdp; struct rcu_node *rnp; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); + lockdep_assert_irqs_disabled(); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -530,7 +532,7 @@ void rcu_read_unlock_special(struct task_struct *t) /* Unboost if we were boosted. */ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) - rt_mutex_unlock(&rnp->boost_mtx); + rt_mutex_futex_unlock(&rnp->boost_mtx); /* * If this was the last task on the expedited lists, @@ -911,8 +913,6 @@ void exit_rcu(void) #ifdef CONFIG_RCU_BOOST -#include "../locking/rtmutex_common.h" - static void rcu_wake_cond(struct task_struct *t, int status) { /* @@ -1421,7 +1421,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); unsigned long dj; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; @@ -1470,7 +1470,7 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_is_nocb_cpu(smp_processor_id())) return; @@ -1507,7 +1507,7 @@ static void rcu_prepare_for_idle(void) rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (rcu_segcblist_pend_cbs(&rdp->cblist)) + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) continue; rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ @@ -1525,7 +1525,7 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) @@ -1671,6 +1671,7 @@ static void print_cpu_stall_info_begin(void) */ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) { + unsigned long delta; char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = rdp->dynticks; @@ -1685,11 +1686,15 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : + rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : + "!."[!delta], ticks_value, ticks_title, rcu_dynticks_snap(rdtp) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, @@ -2012,7 +2017,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (!rcu_is_nocb_cpu(smp_processor_id())) return false; /* Not NOCBs CPU, caller must migrate CBs. */ __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), @@ -2261,9 +2266,11 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ -static void do_nocb_deferred_wakeup_timer(unsigned long x) +static void do_nocb_deferred_wakeup_timer(struct timer_list *t) { - do_nocb_deferred_wakeup_common((struct rcu_data *)x); + struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); + + do_nocb_deferred_wakeup_common(rdp); } /* @@ -2327,8 +2334,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_lock_init(&rdp->nocb_lock); - setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, - (unsigned long)rdp); + timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); } /* @@ -2583,7 +2589,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; - housekeeping_affine(current); + housekeeping_affine(current, HK_FLAG_RCU); } /* Record the current task on dyntick-idle entry. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5033b66d2753..fbd56d6e575b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -51,6 +51,7 @@ #include <linux/kthread.h> #include <linux/tick.h> #include <linux/rcupdate_wait.h> +#include <linux/sched/isolation.h> #define CREATE_TRACE_POINTS @@ -494,6 +495,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #endif int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); @@ -575,7 +577,6 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); -static void rcu_spawn_tasks_kthread(void); static struct task_struct *rcu_tasks_kthread_ptr; /** @@ -600,7 +601,6 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; - bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); rhp->next = NULL; rhp->func = func; @@ -610,11 +610,8 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); /* We can't create the thread unless interrupts are enabled. */ - if ((needwake && havetask) || - (!havetask && !irqs_disabled_flags(flags))) { - rcu_spawn_tasks_kthread(); + if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) wake_up(&rcu_tasks_cbs_wq); - } } EXPORT_SYMBOL_GPL(call_rcu_tasks); @@ -718,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) LIST_HEAD(rcu_tasks_holdouts); /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current); + housekeeping_affine(current, HK_FLAG_RCU); /* * Each pass through the following loop makes one check for @@ -853,27 +850,18 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */ -static void rcu_spawn_tasks_kthread(void) +/* Spawn rcu_tasks_kthread() at core_initcall() time. */ +static int __init rcu_spawn_tasks_kthread(void) { - static DEFINE_MUTEX(rcu_tasks_kthread_mutex); struct task_struct *t; - if (READ_ONCE(rcu_tasks_kthread_ptr)) { - smp_mb(); /* Ensure caller sees full kthread. */ - return; - } - mutex_lock(&rcu_tasks_kthread_mutex); - if (rcu_tasks_kthread_ptr) { - mutex_unlock(&rcu_tasks_kthread_mutex); - return; - } t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); BUG_ON(IS_ERR(t)); smp_mb(); /* Ensure others see full kthread. */ WRITE_ONCE(rcu_tasks_kthread_ptr, t); - mutex_unlock(&rcu_tasks_kthread_mutex); + return 0; } +core_initcall(rcu_spawn_tasks_kthread); /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) diff --git a/kernel/resource.c b/kernel/resource.c index 9b5f04404152..54ba6de3757c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -397,9 +397,32 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, res->start = p->start; if (res->end > p->end) res->end = p->end; + res->flags = p->flags; + res->desc = p->desc; return 0; } +static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, + bool first_level_children_only, + void *arg, + int (*func)(struct resource *, void *)) +{ + u64 orig_end = res->end; + int ret = -1; + + while ((res->start < res->end) && + !find_next_iomem_res(res, desc, first_level_children_only)) { + ret = (*func)(res, arg); + if (ret) + break; + + res->start = res->end + 1; + res->end = orig_end; + } + + return ret; +} + /* * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. @@ -415,29 +438,15 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, * <linux/ioport.h> and set it in 'desc' of a target resource entry. */ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, - u64 end, void *arg, int (*func)(u64, u64, void *)) + u64 end, void *arg, int (*func)(struct resource *, void *)) { struct resource res; - u64 orig_end; - int ret = -1; res.start = start; res.end = end; res.flags = flags; - orig_end = res.end; - - while ((res.start < res.end) && - (!find_next_iomem_res(&res, desc, false))) { - - ret = (*func)(res.start, res.end, arg); - if (ret) - break; - - res.start = res.end + 1; - res.end = orig_end; - } - return ret; + return __walk_iomem_res_desc(&res, desc, false, arg, func); } /* @@ -448,25 +457,33 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, * ranges. */ int walk_system_ram_res(u64 start, u64 end, void *arg, - int (*func)(u64, u64, void *)) + int (*func)(struct resource *, void *)) { struct resource res; - u64 orig_end; - int ret = -1; res.start = start; res.end = end; res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - orig_end = res.end; - while ((res.start < res.end) && - (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) { - ret = (*func)(res.start, res.end, arg); - if (ret) - break; - res.start = res.end + 1; - res.end = orig_end; - } - return ret; + + return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + arg, func); +} + +/* + * This function calls the @func callback against all memory ranges, which + * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY. + */ +int walk_mem_res(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + struct resource res; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + arg, func); } #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) @@ -508,6 +525,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; } + /* * This generic page_is_ram() returns true if specified address is * registered as System RAM in iomem_resource list. diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a9ee16bbc693..e2f9d4feff40 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o +obj-$(CONFIG_CPU_ISOLATION) += isolation.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index ca0f8fc945c6..e086babe6c61 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -388,7 +388,7 @@ void sched_clock_tick(void) if (unlikely(!sched_clock_running)) return; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); scd = this_scd(); __scd_stamp(scd); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..5b82a0073532 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ #include <linux/profile.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/sched/isolation.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -42,18 +43,21 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) /* * Debugging: various feature bits + * + * If SCHED_DEBUG is disabled, each compilation unit has its own copy of + * sysctl_sched_features, defined in sched.h, to allow constants propagation + * at compile time and compiler optimization based on features default. */ - #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | - const_debug unsigned int sysctl_sched_features = #include "features.h" 0; - #undef SCHED_FEAT +#endif /* * Number of tasks to iterate in a single balance run. @@ -83,9 +87,6 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* CPUs with isolated domains */ -cpumask_var_t cpu_isolated_map; - /* * __task_rq_lock - lock the rq @p resides on. */ @@ -505,8 +506,7 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!raw_spin_trylock_irqsave(&rq->lock, flags)) - return; + raw_spin_lock_irqsave(&rq->lock, flags); resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -526,7 +526,7 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) + if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) return cpu; rcu_read_lock(); @@ -535,15 +535,15 @@ int get_nohz_timer_target(void) if (cpu == i) continue; - if (!idle_cpu(i) && is_housekeeping_cpu(i)) { + if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { cpu = i; goto unlock; } } } - if (!is_housekeeping_cpu(cpu)) - cpu = housekeeping_any_cpu(); + if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) + cpu = housekeeping_any_cpu(HK_FLAG_TIMER); unlock: rcu_read_unlock(); return cpu; @@ -733,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p) +static void set_load_weight(struct task_struct *p, bool update_load) { int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -747,8 +747,16 @@ static void set_load_weight(struct task_struct *p) return; } - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; + /* + * SCHED_OTHER tasks have to update their load when changing their + * weight + */ + if (update_load && p->sched_class == &fair_sched_class) { + reweight_task(p, prio); + } else { + load->weight = scale_load(sched_prio_to_weight[prio]); + load->inv_weight = sched_prio_to_wmult[prio]; + } } static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -2358,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = __normal_prio(p); - set_load_weight(p); + set_load_weight(p, false); /* * We don't need the reset flag anymore after the fork. It has @@ -3805,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); + set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); delta = p->prio - old_prio; @@ -3962,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p); + set_load_weight(p, true); } /* Actually do priority change: must hold pi & rq lock. */ @@ -4842,6 +4850,7 @@ int __sched _cond_resched(void) preempt_schedule_common(); return 1; } + rcu_all_qs(); return 0; } EXPORT_SYMBOL(_cond_resched); @@ -5165,6 +5174,7 @@ void sched_show_task(struct task_struct *p) show_stack(p, NULL); put_task_stack(p); } +EXPORT_SYMBOL_GPL(sched_show_task); static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p) @@ -5726,10 +5736,6 @@ static inline void sched_init_smt(void) { } void __init sched_init_smp(void) { - cpumask_var_t non_isolated_cpus; - - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - sched_init_numa(); /* @@ -5739,16 +5745,12 @@ void __init sched_init_smp(void) */ mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - if (cpumask_empty(non_isolated_cpus)) - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); sched_init_granularity(); - free_cpumask_var(non_isolated_cpus); init_sched_rt_class(); init_sched_dl_class(); @@ -5933,7 +5935,7 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); } - set_load_weight(&init_task); + set_load_weight(&init_task, false); /* * The boot idle thread does lazy MMU switching as well: @@ -5952,9 +5954,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - /* May be allocated at isolcpus cmdline parse time */ - if (cpu_isolated_map == NULL) - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); set_cpu_rq_start_time(smp_processor_id()); #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 14d2dbf97c53..9be8b68a66da 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -259,8 +259,7 @@ static inline u64 account_other_time(u64 max) { u64 accounted; - /* Shall be converted to a lockdep-enabled lightweight check */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); accounted = steal_account_process_time(max); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 4ae5c1ea90e2..f349f7e98dec 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p) if (p->state == TASK_DEAD) sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); } @@ -1210,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) } raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(p); @@ -1365,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } else if ((flags & ENQUEUE_RESTORE) && + dl_time_before(dl_se->deadline, + rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { + setup_new_dl_entity(dl_se); } __enqueue_dl_entity(dl_se); @@ -2167,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. */ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } @@ -2256,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - /* - * If p is boosted we already updated its params in - * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), - * p's deadline being now already after rq_clock(rq). - */ - if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(&p->dl); if (rq->curr != p) { #ifdef CONFIG_SMP @@ -2452,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && @@ -2464,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, * But this would require to set the task's "inactive * timer" when the task is not inactive. */ - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); dl_change_utilization(p, new_bw); err = 0; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2f93e4a2d9f6..1ca0130ed4f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P_SCHEDSTAT(se->statistics.wait_count); } P(se->load.weight); + P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); + P(se->avg.runnable_load_avg); #endif #undef PN_SCHEDSTAT @@ -558,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->runnable_load_avg); + cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); - SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", - atomic_long_read(&cfs_rq->removed_load_avg)); - SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", - atomic_long_read(&cfs_rq->removed_util_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", + cfs_rq->removed.load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", + cfs_rq->removed.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", + cfs_rq->removed.runnable_sum); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); @@ -1004,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); + P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); + P(se.avg.runnable_load_sum); P(se.avg.util_sum); P(se.avg.load_avg); + P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5c09ddf8c832..0989676c50e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -33,6 +33,7 @@ #include <linux/mempolicy.h> #include <linux/migrate.h> #include <linux/task_work.h> +#include <linux/sched/isolation.h> #include <trace/events/sched.h> @@ -717,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se) { struct sched_avg *sa = &se->avg; - sa->last_update_time = 0; - /* - * sched_avg's period_contrib should be strictly less then 1024, so - * we give it 1023 to make sure it is almost a period (1024us), and - * will definitely be update (after enqueue). - */ - sa->period_contrib = 1023; + memset(sa, 0, sizeof(*sa)); + /* * Tasks are intialized with full load to be seen as heavy tasks until * they get a chance to stabilize to their real load level. @@ -731,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->load_avg = scale_load_down(se->load.weight); - sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - /* - * At this point, util_avg won't be used in select_task_rq_fair anyway - */ - sa->util_avg = 0; - sa->util_sum = 0; + sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); + + se->runnable_weight = se->load.weight; + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -785,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se) } else { sa->util_avg = cap; } - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } if (entity_is_task(se)) { @@ -2026,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; } else { - delta = p->se.avg.load_sum / p->se.load.weight; + delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX; } @@ -2693,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->nr_running--; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +#ifdef CONFIG_SMP +/* + * XXX we want to get rid of these helpers and use the full load resolution. + */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} + +static inline void +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->runnable_weight += se->runnable_weight; + + cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; + cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; +} + +static inline void +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->runnable_weight -= se->runnable_weight; + + sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); + sub_positive(&cfs_rq->avg.runnable_load_sum, + se_runnable(se) * se->avg.runnable_load_sum); +} + +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; +} + +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); +} +#else +static inline void +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +#endif + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight, unsigned long runnable) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + account_entity_dequeue(cfs_rq, se); + dequeue_runnable_load_avg(cfs_rq, se); + } + dequeue_load_avg(cfs_rq, se); + + se->runnable_weight = runnable; + update_load_set(&se->load, weight); + +#ifdef CONFIG_SMP + do { + u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); + se->avg.runnable_load_avg = + div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); + } while (0); +#endif + + enqueue_load_avg(cfs_rq, se); + if (se->on_rq) { + account_entity_enqueue(cfs_rq, se); + enqueue_runnable_load_avg(cfs_rq, se); + } +} + +void reweight_task(struct task_struct *p, int prio) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct load_weight *load = &se->load; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + reweight_entity(cfs_rq, se, weight, weight); + load->inv_weight = sched_prio_to_wmult[prio]; +} + #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) +/* + * All this does is approximate the hierarchical proportion which includes that + * global sum we all love to hate. + * + * That is, the weight of a group entity, is the proportional share of the + * group weight based on the group runqueue weights. That is: + * + * tg->weight * grq->load.weight + * ge->load.weight = ----------------------------- (1) + * \Sum grq->load.weight + * + * Now, because computing that sum is prohibitively expensive to compute (been + * there, done that) we approximate it with this average stuff. The average + * moves slower and therefore the approximation is cheaper and more stable. + * + * So instead of the above, we substitute: + * + * grq->load.weight -> grq->avg.load_avg (2) + * + * which yields the following: + * + * tg->weight * grq->avg.load_avg + * ge->load.weight = ------------------------------ (3) + * tg->load_avg + * + * Where: tg->load_avg ~= \Sum grq->avg.load_avg + * + * That is shares_avg, and it is right (given the approximation (2)). + * + * The problem with it is that because the average is slow -- it was designed + * to be exactly that of course -- this leads to transients in boundary + * conditions. In specific, the case where the group was idle and we start the + * one task. It takes time for our CPU's grq->avg.load_avg to build up, + * yielding bad latency etc.. + * + * Now, in that special case (1) reduces to: + * + * tg->weight * grq->load.weight + * ge->load.weight = ----------------------------- = tg->weight (4) + * grp->load.weight + * + * That is, the sum collapses because all other CPUs are idle; the UP scenario. + * + * So what we do is modify our approximation (3) to approach (4) in the (near) + * UP case, like: + * + * ge->load.weight = + * + * tg->weight * grq->load.weight + * --------------------------------------------------- (5) + * tg->load_avg - grq->avg.load_avg + grq->load.weight + * + * But because grq->load.weight can drop to 0, resulting in a divide by zero, + * we need to use grq->avg.load_avg as its lower bound, which then gives: + * + * + * tg->weight * grq->load.weight + * ge->load.weight = ----------------------------- (6) + * tg_load_avg' + * + * Where: + * + * tg_load_avg' = tg->load_avg - grq->avg.load_avg + + * max(grq->load.weight, grq->avg.load_avg) + * + * And that is shares_weight and is icky. In the (near) UP case it approaches + * (4) while in the normal case it approaches (3). It consistently + * overestimates the ge->load.weight and therefore: + * + * \Sum ge->load.weight >= tg->weight + * + * hence icky! + */ +static long calc_group_shares(struct cfs_rq *cfs_rq) { - long tg_weight, load, shares; + long tg_weight, tg_shares, load, shares; + struct task_group *tg = cfs_rq->tg; - /* - * This really should be: cfs_rq->avg.load_avg, but instead we use - * cfs_rq->load.weight, which is its upper bound. This helps ramp up - * the shares for small weight interactive tasks. - */ - load = scale_load_down(cfs_rq->load.weight); + tg_shares = READ_ONCE(tg->shares); + + load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg); tg_weight = atomic_long_read(&tg->load_avg); @@ -2712,7 +2912,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) tg_weight -= cfs_rq->tg_load_avg_contrib; tg_weight += load; - shares = (tg->shares * load); + shares = (tg_shares * load); if (tg_weight) shares /= tg_weight; @@ -2728,63 +2928,86 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) * case no task is runnable on a CPU MIN_SHARES=2 should be returned * instead of 0. */ - if (shares < MIN_SHARES) - shares = MIN_SHARES; - if (shares > tg->shares) - shares = tg->shares; - - return shares; -} -# else /* CONFIG_SMP */ -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - return tg->shares; + return clamp_t(long, shares, MIN_SHARES, tg_shares); } -# endif /* CONFIG_SMP */ -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) +/* + * This calculates the effective runnable weight for a group entity based on + * the group entity weight calculated above. + * + * Because of the above approximation (2), our group entity weight is + * an load_avg based ratio (3). This means that it includes blocked load and + * does not represent the runnable weight. + * + * Approximate the group entity's runnable weight per ratio from the group + * runqueue: + * + * grq->avg.runnable_load_avg + * ge->runnable_weight = ge->load.weight * -------------------------- (7) + * grq->avg.load_avg + * + * However, analogous to above, since the avg numbers are slow, this leads to + * transients in the from-idle case. Instead we use: + * + * ge->runnable_weight = ge->load.weight * + * + * max(grq->avg.runnable_load_avg, grq->runnable_weight) + * ----------------------------------------------------- (8) + * max(grq->avg.load_avg, grq->load.weight) + * + * Where these max() serve both to use the 'instant' values to fix the slow + * from-idle and avoid the /0 on to-idle, similar to (6). + */ +static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) { - if (se->on_rq) { - /* commit outstanding execution time */ - if (cfs_rq->curr == se) - update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); - } + long runnable, load_avg; - update_load_set(&se->load, weight); + load_avg = max(cfs_rq->avg.load_avg, + scale_load_down(cfs_rq->load.weight)); - if (se->on_rq) - account_entity_enqueue(cfs_rq, se); + runnable = max(cfs_rq->avg.runnable_load_avg, + scale_load_down(cfs_rq->runnable_weight)); + + runnable *= shares; + if (load_avg) + runnable /= load_avg; + + return clamp_t(long, runnable, MIN_SHARES, shares); } +# endif /* CONFIG_SMP */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct sched_entity *se) +/* + * Recomputes the group entity based on the current state of its group + * runqueue. + */ +static void update_cfs_group(struct sched_entity *se) { - struct cfs_rq *cfs_rq = group_cfs_rq(se); - struct task_group *tg; - long shares; + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long shares, runnable; - if (!cfs_rq) + if (!gcfs_rq) return; - if (throttled_hierarchy(cfs_rq)) + if (throttled_hierarchy(gcfs_rq)) return; - tg = cfs_rq->tg; - #ifndef CONFIG_SMP - if (likely(se->load.weight == tg->shares)) + runnable = shares = READ_ONCE(gcfs_rq->tg->shares); + + if (likely(se->load.weight == shares)) return; +#else + shares = calc_group_shares(gcfs_rq); + runnable = calc_group_runnable(gcfs_rq, shares); #endif - shares = calc_cfs_shares(cfs_rq, tg); - reweight_entity(cfs_rq_of(se), se, shares); + reweight_entity(cfs_rq_of(se), se, shares, runnable); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct sched_entity *se) +static inline void update_cfs_group(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -2893,7 +3116,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) */ static __always_inline u32 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { unsigned long scale_freq, scale_cpu; u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ @@ -2910,10 +3133,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods); - } + sa->runnable_load_sum = + decay_load(sa->runnable_load_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -2926,11 +3147,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, sa->period_contrib = delta; contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } + if (load) + sa->load_sum += load * contrib; + if (runnable) + sa->runnable_load_sum += runnable * contrib; if (running) sa->util_sum += contrib * scale_cpu; @@ -2966,8 +3186,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -___update_load_avg(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) +___update_load_sum(u64 now, int cpu, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -3000,8 +3220,8 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, * this happens during idle_balance() which calls * update_blocked_averages() */ - if (!weight) - running = 0; + if (!load) + runnable = running = 0; /* * Now we know we crossed measurement unit boundaries. The *_avg @@ -3010,63 +3230,96 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) return 0; + return 1; +} + +static __always_inline void +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) +{ + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + /* * Step 2: update *_avg. */ - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); - } - sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); - - return 1; + sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); + sa->util_avg = sa->util_sum / divider; } +/* + * sched_entity: + * + * task: + * se_runnable() == se_weight() + * + * group: [ see update_cfs_group() ] + * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * + * load_sum := runnable_sum + * load_avg = se_weight(se) * runnable_avg + * + * runnable_load_sum := runnable_sum + * runnable_load_avg = se_runnable(se) * runnable_avg + * + * XXX collapse load_sum and runnable_load_sum + * + * cfq_rs: + * + * load_sum = \Sum se_weight(se) * se->avg.load_sum + * load_avg = \Sum se->avg.load_avg + * + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum + * runnable_load_avg = \Sum se->avg.runable_load_avg + */ + static int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + return 1; + } + + return 0; } static int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - return ___update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + cfs_rq->curr == se)) { + + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + return 1; + } + + return 0; } static int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) { - return ___update_load_avg(now, cpu, &cfs_rq->avg, - scale_load_down(cfs_rq->load.weight), - cfs_rq->curr != NULL, cfs_rq); -} + if (___update_load_sum(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + scale_load_down(cfs_rq->runnable_weight), + cfs_rq->curr != NULL)) { -/* - * Signed add and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. - */ -#define add_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(_val) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - \ - res = var + val; \ - \ - if (val < 0 && res > var) \ - res = 0; \ - \ - WRITE_ONCE(*ptr, res); \ -} while (0) + ___update_load_avg(&cfs_rq->avg, 1, 1); + return 1; + } + + return 0; +} #ifdef CONFIG_FAIR_GROUP_SCHED /** @@ -3149,11 +3402,77 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } -/* Take into account change of utilization of a child task group */ + +/* + * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to + * propagate its contribution. The key to this propagation is the invariant + * that for each group: + * + * ge->avg == grq->avg (1) + * + * _IFF_ we look at the pure running and runnable sums. Because they + * represent the very same entity, just at different points in the hierarchy. + * + * + * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and + * simply copies the running sum over. + * + * However, update_tg_cfs_runnable() is more complex. So we have: + * + * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) + * + * And since, like util, the runnable part should be directly transferable, + * the following would _appear_ to be the straight forward approach: + * + * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) + * + * And per (1) we have: + * + * ge->avg.running_avg == grq->avg.running_avg + * + * Which gives: + * + * ge->load.weight * grq->avg.load_avg + * ge->avg.load_avg = ----------------------------------- (4) + * grq->load.weight + * + * Except that is wrong! + * + * Because while for entities historical weight is not important and we + * really only care about our future and therefore can consider a pure + * runnable sum, runqueues can NOT do this. + * + * We specifically want runqueues to have a load_avg that includes + * historical weights. Those represent the blocked load, the load we expect + * to (shortly) return to us. This only works by keeping the weights as + * integral part of the sum. We therefore cannot decompose as per (3). + * + * OK, so what then? + * + * + * Another way to look at things is: + * + * grq->avg.load_avg = \Sum se->avg.load_avg + * + * Therefore, per (2): + * + * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg + * + * And the very thing we're propagating is a change in that sum (someone + * joined/left). So we can easily know the runnable change, which would be, per + * (2) the already tracked se->load_avg divided by the corresponding + * se->weight. + * + * Basically (4) but in differential form: + * + * d(runnable_avg) += se->avg.load_avg / se->load.weight + * (5) + * ge->avg.load_avg += ge->load.weight * d(runnable_avg) + */ + static inline void -update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - struct cfs_rq *gcfs_rq = group_cfs_rq(se); long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; /* Nothing to update */ @@ -3169,102 +3488,65 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; } -/* Take into account change of load of a child task group */ static inline void -update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - struct cfs_rq *gcfs_rq = group_cfs_rq(se); - long delta, load = gcfs_rq->avg.load_avg; + long runnable_sum = gcfs_rq->prop_runnable_sum; + long runnable_load_avg, load_avg; + s64 runnable_load_sum, load_sum; - /* - * If the load of group cfs_rq is null, the load of the - * sched_entity will also be null so we can skip the formula - */ - if (load) { - long tg_load; + if (!runnable_sum) + return; - /* Get tg's load and ensure tg_load > 0 */ - tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + gcfs_rq->prop_runnable_sum = 0; - /* Ensure tg_load >= load and updated with current load*/ - tg_load -= gcfs_rq->tg_load_avg_contrib; - tg_load += load; + load_sum = (s64)se_weight(se) * runnable_sum; + load_avg = div_s64(load_sum, LOAD_AVG_MAX); - /* - * We need to compute a correction term in the case that the - * task group is consuming more CPU than a task of equal - * weight. A task with a weight equals to tg->shares will have - * a load less or equal to scale_load_down(tg->shares). - * Similarly, the sched_entities that represent the task group - * at parent level, can't have a load higher than - * scale_load_down(tg->shares). And the Sum of sched_entities' - * load must be <= scale_load_down(tg->shares). - */ - if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { - /* scale gcfs_rq's load into tg's shares*/ - load *= scale_load_down(gcfs_rq->tg->shares); - load /= tg_load; - } - } + add_positive(&se->avg.load_sum, runnable_sum); + add_positive(&se->avg.load_avg, load_avg); - delta = load - se->avg.load_avg; + add_positive(&cfs_rq->avg.load_avg, load_avg); + add_positive(&cfs_rq->avg.load_sum, load_sum); - /* Nothing to update */ - if (!delta) - return; - - /* Set new sched_entity's load */ - se->avg.load_avg = load; - se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + runnable_load_sum = (s64)se_runnable(se) * runnable_sum; + runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - /* Update parent cfs_rq load */ - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + add_positive(&se->avg.runnable_load_sum, runnable_sum); + add_positive(&se->avg.runnable_load_avg, runnable_load_avg); - /* - * If the sched_entity is already enqueued, we also have to update the - * runnable load avg. - */ if (se->on_rq) { - /* Update parent cfs_rq runnable_load_avg */ - add_positive(&cfs_rq->runnable_load_avg, delta); - cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); + add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); } } -static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) -{ - cfs_rq->propagate_avg = 1; -} - -static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) { - struct cfs_rq *cfs_rq = group_cfs_rq(se); - - if (!cfs_rq->propagate_avg) - return 0; - - cfs_rq->propagate_avg = 0; - return 1; + cfs_rq->propagate = 1; + cfs_rq->prop_runnable_sum += runnable_sum; } /* Update task and its cfs_rq load average */ static inline int propagate_entity_load_avg(struct sched_entity *se) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *gcfs_rq; if (entity_is_task(se)) return 0; - if (!test_and_clear_tg_cfs_propagate(se)) + gcfs_rq = group_cfs_rq(se); + if (!gcfs_rq->propagate) return 0; + gcfs_rq->propagate = 0; + cfs_rq = cfs_rq_of(se); - set_tg_cfs_propagate(cfs_rq); + add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum); - update_tg_cfs_util(cfs_rq, se); - update_tg_cfs_load(cfs_rq, se); + update_tg_cfs_util(cfs_rq, se, gcfs_rq); + update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); return 1; } @@ -3288,7 +3570,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) * If there is a pending propagation, we have to update the load and * the utilization of the sched_entity: */ - if (gcfs_rq->propagate_avg) + if (gcfs_rq->propagate) return false; /* @@ -3308,27 +3590,10 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 0; } -static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -/* - * Unsigned subtract and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. - */ -#define sub_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(*ptr) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - res = var - val; \ - if (res > var) \ - res = 0; \ - WRITE_ONCE(*ptr, res); \ -} while (0) - /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_task() @@ -3348,65 +3613,45 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { + unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed_load = 0, removed_util = 0; + int decayed = 0; - if (atomic_long_read(&cfs_rq->removed_load_avg)) { - s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + if (cfs_rq->removed.nr) { + unsigned long r; + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + + raw_spin_lock(&cfs_rq->removed.lock); + swap(cfs_rq->removed.util_avg, removed_util); + swap(cfs_rq->removed.load_avg, removed_load); + swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); + cfs_rq->removed.nr = 0; + raw_spin_unlock(&cfs_rq->removed.lock); + + r = removed_load; sub_positive(&sa->load_avg, r); - sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); - removed_load = 1; - set_tg_cfs_propagate(cfs_rq); - } + sub_positive(&sa->load_sum, r * divider); - if (atomic_long_read(&cfs_rq->removed_util_avg)) { - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); + r = removed_util; sub_positive(&sa->util_avg, r); - sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); - removed_util = 1; - set_tg_cfs_propagate(cfs_rq); + sub_positive(&sa->util_sum, r * divider); + + add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); + + decayed = 1; } - decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); + decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (decayed || removed_util) + if (decayed) cfs_rq_util_change(cfs_rq); - return decayed || removed_load; -} - -/* - * Optional action to be done while updating the load average - */ -#define UPDATE_TG 0x1 -#define SKIP_AGE_LOAD 0x2 - -/* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int flags) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); - int decayed; - - /* - * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration - */ - if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) - __update_load_avg_se(now, cpu, cfs_rq, se); - - decayed = update_cfs_rq_load_avg(now, cfs_rq); - decayed |= propagate_entity_load_avg(se); - - if (decayed && (flags & UPDATE_TG)) - update_tg_load_avg(cfs_rq, 0); + return decayed; } /** @@ -3419,12 +3664,39 @@ static inline void update_load_avg(struct sched_entity *se, int flags) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + + /* + * When we attach the @se to the @cfs_rq, we must align the decay + * window because without that, really weird and wonderful things can + * happen. + * + * XXX illustrate + */ se->avg.last_update_time = cfs_rq->avg.last_update_time; - cfs_rq->avg.load_avg += se->avg.load_avg; - cfs_rq->avg.load_sum += se->avg.load_sum; + se->avg.period_contrib = cfs_rq->avg.period_contrib; + + /* + * Hell(o) Nasty stuff.. we need to recompute _sum based on the new + * period_contrib. This isn't strictly correct, but since we're + * entirely outside of the PELT hierarchy, nobody cares if we truncate + * _sum a little. + */ + se->avg.util_sum = se->avg.util_avg * divider; + + se->avg.load_sum = divider; + if (se_weight(se)) { + se->avg.load_sum = + div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); + } + + se->avg.runnable_load_sum = se->avg.load_sum; + + enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; - set_tg_cfs_propagate(cfs_rq); + + add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); cfs_rq_util_change(cfs_rq); } @@ -3439,39 +3711,47 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - - sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); + dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); - set_tg_cfs_propagate(cfs_rq); + + add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); cfs_rq_util_change(cfs_rq); } -/* Add the load generated by se into cfs_rq's load average */ -static inline void -enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 +#define DO_ATTACH 0x4 + +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - struct sched_avg *sa = &se->avg; + u64 now = cfs_rq_clock_task(cfs_rq); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + int decayed; + + /* + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration + */ + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cpu, cfs_rq, se); - cfs_rq->runnable_load_avg += sa->load_avg; - cfs_rq->runnable_load_sum += sa->load_sum; + decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed |= propagate_entity_load_avg(se); + + if (!se->avg.last_update_time && (flags & DO_ATTACH)) { - if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, 0); - } -} -/* Remove the runnable load generated by se from cfs_rq's runnable load average */ -static inline void -dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_load_avg = - max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); - cfs_rq->runnable_load_sum = - max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); + } else if (decayed && (flags & UPDATE_TG)) + update_tg_load_avg(cfs_rq, 0); } #ifndef CONFIG_64BIT @@ -3515,6 +3795,7 @@ void sync_entity_load_avg(struct sched_entity *se) void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); + unsigned long flags; /* * tasks cannot exit without having gone through wake_up_new_task() -> @@ -3527,13 +3808,18 @@ void remove_entity_load_avg(struct sched_entity *se) */ sync_entity_load_avg(se); - atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); + + raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); + ++cfs_rq->removed.nr; + cfs_rq->removed.util_avg += se->avg.util_avg; + cfs_rq->removed.load_avg += se->avg.load_avg; + cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ + raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) { - return cfs_rq->runnable_load_avg; + return cfs_rq->avg.runnable_load_avg; } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3553,16 +3839,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 +#define DO_ATTACH 0x0 -static inline void update_load_avg(struct sched_entity *se, int not_used1) +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { - cfs_rq_util_change(cfs_rq_of(se)); + cfs_rq_util_change(cfs_rq); } -static inline void -enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline void -dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void @@ -3707,9 +3990,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * its group cfs_rq * - Add its new weight to cfs_rq->load.weight */ - update_load_avg(se, UPDATE_TG); - enqueue_entity_load_avg(cfs_rq, se); - update_cfs_shares(se); + update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + update_cfs_group(se); + enqueue_runnable_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) @@ -3791,8 +4074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(se, UPDATE_TG); - dequeue_entity_load_avg(cfs_rq, se); + update_load_avg(cfs_rq, se, UPDATE_TG); + dequeue_runnable_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); @@ -3815,7 +4098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(se); + update_cfs_group(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -3879,7 +4162,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -3981,7 +4264,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_load_avg(prev, 0); + update_load_avg(cfs_rq, prev, 0); } cfs_rq->curr = NULL; } @@ -3997,8 +4280,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_load_avg(curr, UPDATE_TG); - update_cfs_shares(curr); + update_load_avg(cfs_rq, curr, UPDATE_TG); + update_cfs_group(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4915,8 +5198,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); - update_cfs_shares(se); + update_load_avg(cfs_rq, se, UPDATE_TG); + update_cfs_group(se); } if (!se) @@ -4974,8 +5257,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); - update_cfs_shares(se); + update_load_avg(cfs_rq, se, UPDATE_TG); + update_cfs_group(se); } if (!se) @@ -5449,6 +5732,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, @@ -5456,8 +5741,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; + unsigned long min_runnable_load = ULONG_MAX; + unsigned long this_runnable_load = ULONG_MAX; + unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; @@ -5578,10 +5864,10 @@ skip_spare: } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -5630,6 +5916,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int new_cpu = cpu; + + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return new_cpu; +} + #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -5982,50 +6315,30 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = cpu; } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_wake + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } + if (!sd) { - pick_cpu: +pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } else while (sd) { - struct sched_group *group; - int weight; - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ + } else { + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } rcu_read_unlock(); return new_cpu; } +static void detach_entity_cfs_rq(struct sched_entity *se); + /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -6059,14 +6372,25 @@ static void migrate_task_rq_fair(struct task_struct *p) se->vruntime -= min_vruntime; } - /* - * We are supposed to update the task to "current" time, then its up to date - * and ready to go to new CPU/cfs_rq. But we have difficulty in getting - * what current time is, so simply throw away the out-of-date time. This - * will result in the wakee task is less decayed, but giving the wakee more - * load sounds not bad. - */ - remove_entity_load_avg(&p->se); + if (p->on_rq == TASK_ON_RQ_MIGRATING) { + /* + * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' + * rq->lock and can modify state directly. + */ + lockdep_assert_held(&task_rq(p)->lock); + detach_entity_cfs_rq(&p->se); + + } else { + /* + * We are supposed to update the task to "current" time, then + * its up to date and ready to go to new CPU/cfs_rq. But we + * have difficulty in getting what current time is, so simply + * throw away the out-of-date time. This will result in the + * wakee task is less decayed, but giving the wakee more load + * sounds not bad. + */ + remove_entity_load_avg(&p->se); + } /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; @@ -6334,10 +6658,7 @@ again: set_next_entity(cfs_rq, se); } - if (hrtick_enabled(rq)) - hrtick_start_fair(rq, p); - - return p; + goto done; simple: #endif @@ -6351,6 +6672,16 @@ simple: p = task_of(se); +done: __maybe_unused +#ifdef CONFIG_SMP + /* + * Move the next running task to the front of + * the list, so our cfs_tasks list becomes MRU + * one. + */ + list_move(&p->se.group_node, &rq->cfs_tasks); +#endif + if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); @@ -6786,11 +7117,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) */ static struct task_struct *detach_one_task(struct lb_env *env) { - struct task_struct *p, *n; + struct task_struct *p; lockdep_assert_held(&env->src_rq->lock); - list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + list_for_each_entry_reverse(p, + &env->src_rq->cfs_tasks, se.group_node) { if (!can_migrate_task(p, env)) continue; @@ -6836,7 +7168,7 @@ static int detach_tasks(struct lb_env *env) if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) break; - p = list_first_entry(tasks, struct task_struct, se.group_node); + p = list_last_entry(tasks, struct task_struct, se.group_node); env->loop++; /* We've more or less seen every task there is, call it quits */ @@ -6886,7 +7218,7 @@ static int detach_tasks(struct lb_env *env) continue; next: - list_move_tail(&p->se.group_node, tasks); + list_move(&p->se.group_node, tasks); } /* @@ -6962,7 +7294,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.util_sum) return false; - if (cfs_rq->runnable_load_sum) + if (cfs_rq->avg.runnable_load_sum) return false; return true; @@ -6994,7 +7326,7 @@ static void update_blocked_averages(int cpu) /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) - update_load_avg(se, 0); + update_load_avg(cfs_rq_of(se), se, 0); /* * There can be a lot of idle CPU cgroups. Don't let fully @@ -7875,8 +8207,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; @@ -8693,7 +9028,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!is_housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) return; if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) @@ -9158,7 +9493,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_load_avg(cfs_rq, se, UPDATE_TG); } } #else @@ -9170,7 +9505,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* Catch up with the cfs_rq and remove our load when we leave */ - update_load_avg(se, 0); + update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); @@ -9189,7 +9524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) #endif /* Synchronize entity with its cfs_rq */ - update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); propagate_entity_cfs_rq(se); @@ -9271,11 +9606,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->propagate_avg = 0; -#endif - atomic_long_set(&cfs_rq->removed_load_avg, 0); - atomic_long_set(&cfs_rq->removed_util_avg, 0); + raw_spin_lock_init(&cfs_rq->removed.lock); #endif } @@ -9473,8 +9804,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); for_each_sched_entity(se) { - update_load_avg(se, UPDATE_TG); - update_cfs_shares(se); + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); + update_cfs_group(se); } rq_unlock_irqrestore(rq, &rf); } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 257f4f0b4532..7dae9eb8c042 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -209,6 +209,7 @@ exit_idle: */ static void do_idle(void) { + int cpu = smp_processor_id(); /* * If the arch has a polling bit, we maintain an invariant: * @@ -219,14 +220,13 @@ static void do_idle(void) */ __current_set_polling(); - quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) { + if (cpu_is_offline(cpu)) { cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c new file mode 100644 index 000000000000..b71b436f59f2 --- /dev/null +++ b/kernel/sched/isolation.c @@ -0,0 +1,155 @@ +/* + * Housekeeping management. Manage the targets for routine code that can run on + * any CPU: unbound workqueues, timers, kthreads and any offloadable work. + * + * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker + * + */ + +#include <linux/sched/isolation.h> +#include <linux/tick.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/static_key.h> +#include <linux/ctype.h> + +DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); +EXPORT_SYMBOL_GPL(housekeeping_overriden); +static cpumask_var_t housekeeping_mask; +static unsigned int housekeeping_flags; + +int housekeeping_any_cpu(enum hk_flags flags) +{ + if (static_branch_unlikely(&housekeeping_overriden)) + if (housekeeping_flags & flags) + return cpumask_any_and(housekeeping_mask, cpu_online_mask); + return smp_processor_id(); +} +EXPORT_SYMBOL_GPL(housekeeping_any_cpu); + +const struct cpumask *housekeeping_cpumask(enum hk_flags flags) +{ + if (static_branch_unlikely(&housekeeping_overriden)) + if (housekeeping_flags & flags) + return housekeeping_mask; + return cpu_possible_mask; +} +EXPORT_SYMBOL_GPL(housekeeping_cpumask); + +void housekeeping_affine(struct task_struct *t, enum hk_flags flags) +{ + if (static_branch_unlikely(&housekeeping_overriden)) + if (housekeeping_flags & flags) + set_cpus_allowed_ptr(t, housekeeping_mask); +} +EXPORT_SYMBOL_GPL(housekeeping_affine); + +bool housekeeping_test_cpu(int cpu, enum hk_flags flags) +{ + if (static_branch_unlikely(&housekeeping_overriden)) + if (housekeeping_flags & flags) + return cpumask_test_cpu(cpu, housekeeping_mask); + return true; +} +EXPORT_SYMBOL_GPL(housekeeping_test_cpu); + +void __init housekeeping_init(void) +{ + if (!housekeeping_flags) + return; + + static_branch_enable(&housekeeping_overriden); + + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); +} + +static int __init housekeeping_setup(char *str, enum hk_flags flags) +{ + cpumask_var_t non_housekeeping_mask; + int err; + + alloc_bootmem_cpumask_var(&non_housekeeping_mask); + err = cpulist_parse(str, non_housekeeping_mask); + if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) { + pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; + } + + if (!housekeeping_flags) { + alloc_bootmem_cpumask_var(&housekeeping_mask); + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, non_housekeeping_mask); + if (cpumask_empty(housekeeping_mask)) + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + } else { + cpumask_var_t tmp; + + alloc_bootmem_cpumask_var(&tmp); + cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); + if (!cpumask_equal(tmp, housekeeping_mask)) { + pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); + free_bootmem_cpumask_var(tmp); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; + } + free_bootmem_cpumask_var(tmp); + } + + if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { + if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { + tick_nohz_full_setup(non_housekeeping_mask); + } else { + pr_warn("Housekeeping: nohz unsupported." + " Build with CONFIG_NO_HZ_FULL\n"); + free_bootmem_cpumask_var(non_housekeeping_mask); + return 0; + } + } + + housekeeping_flags |= flags; + + free_bootmem_cpumask_var(non_housekeeping_mask); + + return 1; +} + +static int __init housekeeping_nohz_full_setup(char *str) +{ + unsigned int flags; + + flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + + return housekeeping_setup(str, flags); +} +__setup("nohz_full=", housekeeping_nohz_full_setup); + +static int __init housekeeping_isolcpus_setup(char *str) +{ + unsigned int flags = 0; + + while (isalpha(*str)) { + if (!strncmp(str, "nohz,", 5)) { + str += 5; + flags |= HK_FLAG_TICK; + continue; + } + + if (!strncmp(str, "domain,", 7)) { + str += 7; + flags |= HK_FLAG_DOMAIN; + continue; + } + + pr_warn("isolcpus: Error, unknown flag\n"); + return 0; + } + + /* Default behaviour for isolcpus without flags */ + if (!flags) + flags |= HK_FLAG_DOMAIN; + + return housekeeping_setup(str, flags); +} +__setup("isolcpus=", housekeeping_isolcpus_setup); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3c96c80e0992..d8c43d73e078 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -74,10 +74,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) -static void push_irq_work_func(struct irq_work *work); -#endif - void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -97,13 +93,6 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); - -#ifdef HAVE_RT_PUSH_IPI - rt_rq->push_flags = 0; - rt_rq->push_cpu = nr_cpu_ids; - raw_spin_lock_init(&rt_rq->push_lock); - init_irq_work(&rt_rq->push_work, push_irq_work_func); -#endif #endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -1876,241 +1865,166 @@ static void push_rt_tasks(struct rq *rq) } #ifdef HAVE_RT_PUSH_IPI + /* - * The search for the next cpu always starts at rq->cpu and ends - * when we reach rq->cpu again. It will never return rq->cpu. - * This returns the next cpu to check, or nr_cpu_ids if the loop - * is complete. + * When a high priority task schedules out from a CPU and a lower priority + * task is scheduled in, a check is made to see if there's any RT tasks + * on other CPUs that are waiting to run because a higher priority RT task + * is currently running on its CPU. In this case, the CPU with multiple RT + * tasks queued on it (overloaded) needs to be notified that a CPU has opened + * up that may be able to run one of its non-running queued RT tasks. + * + * All CPUs with overloaded RT tasks need to be notified as there is currently + * no way to know which of these CPUs have the highest priority task waiting + * to run. Instead of trying to take a spinlock on each of these CPUs, + * which has shown to cause large latency when done on machines with many + * CPUs, sending an IPI to the CPUs to have them push off the overloaded + * RT tasks waiting to run. + * + * Just sending an IPI to each of the CPUs is also an issue, as on large + * count CPU machines, this can cause an IPI storm on a CPU, especially + * if its the only CPU with multiple RT tasks queued, and a large number + * of CPUs scheduling a lower priority task at the same time. + * + * Each root domain has its own irq work function that can iterate over + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT + * tassk must be checked if there's one or many CPUs that are lowering + * their priority, there's a single irq work iterator that will try to + * push off RT tasks that are waiting to run. + * + * When a CPU schedules a lower priority task, it will kick off the + * irq work iterator that will jump to each CPU with overloaded RT tasks. + * As it only takes the first CPU that schedules a lower priority task + * to start the process, the rto_start variable is incremented and if + * the atomic result is one, then that CPU will try to take the rto_lock. + * This prevents high contention on the lock as the process handles all + * CPUs scheduling lower priority tasks. + * + * All CPUs that are scheduling a lower priority task will increment the + * rt_loop_next variable. This will make sure that the irq work iterator + * checks all RT overloaded CPUs whenever a CPU schedules a new lower + * priority task, even if the iterator is in the middle of a scan. Incrementing + * the rt_loop_next will cause the iterator to perform another scan. * - * rq->rt.push_cpu holds the last cpu returned by this function, - * or if this is the first instance, it must hold rq->cpu. */ static int rto_next_cpu(struct rq *rq) { - int prev_cpu = rq->rt.push_cpu; + struct root_domain *rd = rq->rd; + int next; int cpu; - cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); - /* - * If the previous cpu is less than the rq's CPU, then it already - * passed the end of the mask, and has started from the beginning. - * We end if the next CPU is greater or equal to rq's CPU. + * When starting the IPI RT pushing, the rto_cpu is set to -1, + * rt_next_cpu() will simply return the first CPU found in + * the rto_mask. + * + * If rto_next_cpu() is called with rto_cpu is a valid cpu, it + * will return the next CPU found in the rto_mask. + * + * If there are no more CPUs left in the rto_mask, then a check is made + * against rto_loop and rto_loop_next. rto_loop is only updated with + * the rto_lock held, but any CPU may increment the rto_loop_next + * without any locking. */ - if (prev_cpu < rq->cpu) { - if (cpu >= rq->cpu) - return nr_cpu_ids; + for (;;) { - } else if (cpu >= nr_cpu_ids) { - /* - * We passed the end of the mask, start at the beginning. - * If the result is greater or equal to the rq's CPU, then - * the loop is finished. - */ - cpu = cpumask_first(rq->rd->rto_mask); - if (cpu >= rq->cpu) - return nr_cpu_ids; - } - rq->rt.push_cpu = cpu; + /* When rto_cpu is -1 this acts like cpumask_first() */ + cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); - /* Return cpu to let the caller know if the loop is finished or not */ - return cpu; -} + rd->rto_cpu = cpu; -static int find_next_push_cpu(struct rq *rq) -{ - struct rq *next_rq; - int cpu; + if (cpu < nr_cpu_ids) + return cpu; - while (1) { - cpu = rto_next_cpu(rq); - if (cpu >= nr_cpu_ids) - break; - next_rq = cpu_rq(cpu); + rd->rto_cpu = -1; + + /* + * ACQUIRE ensures we see the @rto_mask changes + * made prior to the @next value observed. + * + * Matches WMB in rt_set_overload(). + */ + next = atomic_read_acquire(&rd->rto_loop_next); - /* Make sure the next rq can push to this rq */ - if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + if (rd->rto_loop == next) break; + + rd->rto_loop = next; } - return cpu; + return -1; } -#define RT_PUSH_IPI_EXECUTING 1 -#define RT_PUSH_IPI_RESTART 2 +static inline bool rto_start_trylock(atomic_t *v) +{ + return !atomic_cmpxchg_acquire(v, 0, 1); +} -/* - * When a high priority task schedules out from a CPU and a lower priority - * task is scheduled in, a check is made to see if there's any RT tasks - * on other CPUs that are waiting to run because a higher priority RT task - * is currently running on its CPU. In this case, the CPU with multiple RT - * tasks queued on it (overloaded) needs to be notified that a CPU has opened - * up that may be able to run one of its non-running queued RT tasks. - * - * On large CPU boxes, there's the case that several CPUs could schedule - * a lower priority task at the same time, in which case it will look for - * any overloaded CPUs that it could pull a task from. To do this, the runqueue - * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting - * for a single overloaded CPU's runqueue lock can produce a large latency. - * (This has actually been observed on large boxes running cyclictest). - * Instead of taking the runqueue lock of the overloaded CPU, each of the - * CPUs that scheduled a lower priority task simply sends an IPI to the - * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with - * lots of contention. The overloaded CPU will look to push its non-running - * RT task off, and if it does, it can then ignore the other IPIs coming - * in, and just pass those IPIs off to any other overloaded CPU. - * - * When a CPU schedules a lower priority task, it only sends an IPI to - * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, - * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with - * RT overloaded tasks, would cause 100 IPIs to go out at once. - * - * The overloaded RT CPU, when receiving an IPI, will try to push off its - * overloaded RT tasks and then send an IPI to the next CPU that has - * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks - * have completed. Just because a CPU may have pushed off its own overloaded - * RT task does not mean it should stop sending the IPI around to other - * overloaded CPUs. There may be another RT task waiting to run on one of - * those CPUs that are of higher priority than the one that was just - * pushed. - * - * An optimization that could possibly be made is to make a CPU array similar - * to the cpupri array mask of all running RT tasks, but for the overloaded - * case, then the IPI could be sent to only the CPU with the highest priority - * RT task waiting, and that CPU could send off further IPIs to the CPU with - * the next highest waiting task. Since the overloaded case is much less likely - * to happen, the complexity of this implementation may not be worth it. - * Instead, just send an IPI around to all overloaded CPUs. - * - * The rq->rt.push_flags holds the status of the IPI that is going around. - * A run queue can only send out a single IPI at a time. The possible flags - * for rq->rt.push_flags are: - * - * (None or zero): No IPI is going around for the current rq - * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around - * RT_PUSH_IPI_RESTART: The priority of the running task for the rq - * has changed, and the IPI should restart - * circulating the overloaded CPUs again. - * - * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated - * before sending to the next CPU. - * - * Instead of having all CPUs that schedule a lower priority task send - * an IPI to the same "first" CPU in the RT overload mask, they send it - * to the next overloaded CPU after their own CPU. This helps distribute - * the work when there's more than one overloaded CPU and multiple CPUs - * scheduling in lower priority tasks. - * - * When a rq schedules a lower priority task than what was currently - * running, the next CPU with overloaded RT tasks is examined first. - * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower - * priority task, it will send an IPI first to CPU 5, then CPU 5 will - * send to CPU 1 if it is still overloaded. CPU 1 will clear the - * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. - * - * The first CPU to notice IPI_RESTART is set, will clear that flag and then - * send an IPI to the next overloaded CPU after the rq->cpu and not the next - * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 - * schedules a lower priority task, and the IPI_RESTART gets set while the - * handling is being done on CPU 5, it will clear the flag and send it back to - * CPU 4 instead of CPU 1. - * - * Note, the above logic can be disabled by turning off the sched_feature - * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be - * taken by the CPU requesting a pull and the waiting RT task will be pulled - * by that CPU. This may be fine for machines with few CPUs. - */ -static void tell_cpu_to_push(struct rq *rq) +static inline void rto_start_unlock(atomic_t *v) { - int cpu; + atomic_set_release(v, 0); +} - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - raw_spin_lock(&rq->rt.push_lock); - /* Make sure it's still executing */ - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - /* - * Tell the IPI to restart the loop as things have - * changed since it started. - */ - rq->rt.push_flags |= RT_PUSH_IPI_RESTART; - raw_spin_unlock(&rq->rt.push_lock); - return; - } - raw_spin_unlock(&rq->rt.push_lock); - } +static void tell_cpu_to_push(struct rq *rq) +{ + int cpu = -1; - /* When here, there's no IPI going around */ + /* Keep the loop going if the IPI is currently active */ + atomic_inc(&rq->rd->rto_loop_next); - rq->rt.push_cpu = rq->cpu; - cpu = find_next_push_cpu(rq); - if (cpu >= nr_cpu_ids) + /* Only one CPU can initiate a loop at a time */ + if (!rto_start_trylock(&rq->rd->rto_loop_start)) return; - rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; + raw_spin_lock(&rq->rd->rto_lock); + + /* + * The rto_cpu is updated under the lock, if it has a valid cpu + * then the IPI is still running and will continue due to the + * update to loop_next, and nothing needs to be done here. + * Otherwise it is finishing up and an ipi needs to be sent. + */ + if (rq->rd->rto_cpu < 0) + cpu = rto_next_cpu(rq); - irq_work_queue_on(&rq->rt.push_work, cpu); + raw_spin_unlock(&rq->rd->rto_lock); + + rto_start_unlock(&rq->rd->rto_loop_start); + + if (cpu >= 0) + irq_work_queue_on(&rq->rd->rto_push_work, cpu); } /* Called from hardirq context */ -static void try_to_push_tasks(void *arg) +void rto_push_irq_work_func(struct irq_work *work) { - struct rt_rq *rt_rq = arg; - struct rq *rq, *src_rq; - int this_cpu; + struct rq *rq; int cpu; - this_cpu = rt_rq->push_cpu; + rq = this_rq(); - /* Paranoid check */ - BUG_ON(this_cpu != smp_processor_id()); - - rq = cpu_rq(this_cpu); - src_rq = rq_of_rt_rq(rt_rq); - -again: + /* + * We do not need to grab the lock to check for has_pushable_tasks. + * When it gets updated, a check is made if a push is possible. + */ if (has_pushable_tasks(rq)) { raw_spin_lock(&rq->lock); - push_rt_task(rq); + push_rt_tasks(rq); raw_spin_unlock(&rq->lock); } - /* Pass the IPI to the next rt overloaded queue */ - raw_spin_lock(&rt_rq->push_lock); - /* - * If the source queue changed since the IPI went out, - * we need to restart the search from that CPU again. - */ - if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { - rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; - rt_rq->push_cpu = src_rq->cpu; - } + raw_spin_lock(&rq->rd->rto_lock); - cpu = find_next_push_cpu(src_rq); + /* Pass the IPI to the next rt overloaded queue */ + cpu = rto_next_cpu(rq); - if (cpu >= nr_cpu_ids) - rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; - raw_spin_unlock(&rt_rq->push_lock); + raw_spin_unlock(&rq->rd->rto_lock); - if (cpu >= nr_cpu_ids) + if (cpu < 0) return; - /* - * It is possible that a restart caused this CPU to be - * chosen again. Don't bother with an IPI, just see if we - * have more to push. - */ - if (unlikely(cpu == rq->cpu)) - goto again; - /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rt_rq->push_work, cpu); -} - -static void push_irq_work_func(struct irq_work *work) -{ - struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); - - try_to_push_tasks(rt_rq); + irq_work_queue_on(&rq->rd->rto_push_work, cpu); } #endif /* HAVE_RT_PUSH_IPI */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3b448ba82225..45ab0bf564e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -227,7 +227,7 @@ struct dl_bw { static inline void __dl_update(struct dl_bw *dl_b, s64 bw); static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; __dl_update(dl_b, (s32)tsk_bw / cpus); @@ -256,7 +256,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy, extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); -extern void __dl_clear_params(struct task_struct *p); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); @@ -419,6 +418,7 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; + unsigned long runnable_weight; unsigned int nr_running, h_nr_running; u64 exec_clock; @@ -444,18 +444,22 @@ struct cfs_rq { * CFS load tracking */ struct sched_avg avg; - u64 runnable_load_sum; - unsigned long runnable_load_avg; -#ifdef CONFIG_FAIR_GROUP_SCHED - unsigned long tg_load_avg_contrib; - unsigned long propagate_avg; -#endif - atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; #endif + struct { + raw_spinlock_t lock ____cacheline_aligned; + int nr; + unsigned long load_avg; + unsigned long util_avg; + unsigned long runnable_sum; + } removed; #ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; + /* * h_load = weight * f(tg) * @@ -502,7 +506,7 @@ static inline int rt_bandwidth_enabled(void) } /* RT IPI pull logic requires IRQ_WORK */ -#ifdef CONFIG_IRQ_WORK +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) # define HAVE_RT_PUSH_IPI #endif @@ -524,12 +528,6 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; -#ifdef HAVE_RT_PUSH_IPI - int push_flags; - int push_cpu; - struct irq_work push_work; - raw_spinlock_t push_lock; -#endif #endif /* CONFIG_SMP */ int rt_queued; @@ -638,6 +636,19 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -655,6 +666,9 @@ extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif #endif /* CONFIG_SMP */ /* @@ -1219,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) # define const_debug const #endif -extern const_debug unsigned int sysctl_sched_features; - #define SCHED_FEAT(name, enabled) \ __SCHED_FEAT_##name , @@ -1232,6 +1244,13 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) + +/* + * To support run-time toggling of sched features, all the translation units + * (but core.c) reference the sysctl_sched_features defined in core.c. + */ +extern const_debug unsigned int sysctl_sched_features; + #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -1239,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ } #include "features.h" - #undef SCHED_FEAT extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) + #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ + +/* + * Each translation unit has its own copy of sysctl_sched_features to allow + * constants propagation at compile time and compiler optimization based on + * features default. + */ +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | +static const_debug __maybe_unused unsigned int sysctl_sched_features = +#include "features.h" + 0; +#undef SCHED_FEAT + #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; @@ -1530,6 +1563,8 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); +extern void reweight_task(struct task_struct *p, int prio); + extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6798276d29af..034cbed7f88b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -4,6 +4,7 @@ */ #include <linux/sched.h> #include <linux/mutex.h> +#include <linux/sched/isolation.h> #include "sched.h" @@ -269,6 +270,12 @@ static int init_rootdomain(struct root_domain *rd) if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; +#ifdef HAVE_RT_PUSH_IPI + rd->rto_cpu = -1; + raw_spin_lock_init(&rd->rto_lock); + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); +#endif + init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; @@ -464,21 +471,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* Setup the mask of CPUs configured for isolated domains */ -static int __init isolated_cpu_setup(char *str) -{ - int ret; - - alloc_bootmem_cpumask_var(&cpu_isolated_map); - ret = cpulist_parse(str, cpu_isolated_map); - if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); - return 0; - } - return 1; -} -__setup("isolcpus=", isolated_cpu_setup); - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -1158,6 +1150,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 117; sd->cache_nice_tries = 1; sd->busy_idx = 2; @@ -1332,6 +1325,10 @@ void sched_init_numa(void) if (!sched_domains_numa_distance) return; + /* Includes NUMA identity node at level 0. */ + sched_domains_numa_distance[level++] = curr_distance; + sched_domains_numa_levels = level; + /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the * unique distances in the node_distance() table. @@ -1379,8 +1376,7 @@ void sched_init_numa(void) return; /* - * 'level' contains the number of unique distances, excluding the - * identity distance node_distance(i,i). + * 'level' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1442,9 +1438,18 @@ void sched_init_numa(void) tl[i] = sched_domain_topology[i]; /* + * Add the NUMA identity distance, aka single NODE. + */ + tl[i++] = (struct sched_domain_topology_level){ + .mask = sd_numa_mask, + .numa_level = 0, + SD_INIT_NAME(NODE) + }; + + /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 0; j < level; i++, j++) { + for (j = 1; j < level; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, @@ -1774,7 +1779,7 @@ int sched_init_domains(const struct cpumask *cpu_map) doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); + cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); @@ -1857,7 +1862,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], doms_new = alloc_sched_domains(1); if (doms_new) { n = 1; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + cpumask_and(doms_new[0], cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); } } else { n = ndoms_new; @@ -1880,7 +1886,8 @@ match1: if (!doms_new) { n = 0; doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + cpumask_and(doms_new[0], cpu_active_mask, + housekeeping_cpumask(HK_FLAG_DOMAIN)); } /* Build new domains: */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 418a1c045933..5f0dfb2abb8d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -190,7 +190,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = - lockless_dereference(current->seccomp.filter); + READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) diff --git a/kernel/smp.c b/kernel/smp.c index c94dd85c8d41..084c8b3a2681 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -213,7 +213,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) call_single_data_t *csd, *csd_next; static bool warned; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); diff --git a/kernel/softirq.c b/kernel/softirq.c index 4e09821f9d9e..662f7b1b7a78 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -137,7 +137,7 @@ EXPORT_SYMBOL(__local_bh_disable_ip); static void __local_bh_enable(unsigned int cnt) { - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_on(_RET_IP_); @@ -158,7 +158,8 @@ EXPORT_SYMBOL(_local_bh_enable); void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) { - WARN_ON_ONCE(in_irq() || irqs_disabled()); + WARN_ON_ONCE(in_irq()); + lockdep_assert_irqs_enabled(); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_disable(); #endif @@ -396,9 +397,8 @@ void irq_exit(void) #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_disable(); #else - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); #endif - account_irq_exit_time(current); preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) @@ -488,7 +488,7 @@ EXPORT_SYMBOL(__tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) { - BUG_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); t->next = __this_cpu_read(tasklet_hi_vec.head); __this_cpu_write(tasklet_hi_vec.head, t); diff --git a/kernel/task_work.c b/kernel/task_work.c index 5718b3ea202a..0fef395662a6 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -68,7 +68,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = lockless_dereference(*pprev))) { + while ((work = READ_ONCE(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 0dbab6d1acb4..dd53e354f630 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -22,7 +22,7 @@ #define div_factor 3 -static u32 rand1, preh_val, posth_val, jph_val; +static u32 rand1, preh_val, posth_val; static int errors, handler_errors, num_tests; static u32 (*target)(u32 value); static u32 (*target2)(u32 value); @@ -34,6 +34,10 @@ static noinline u32 kprobe_target(u32 value) static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) { + if (preemptible()) { + handler_errors++; + pr_err("pre-handler is preemptible\n"); + } preh_val = (rand1 / div_factor); return 0; } @@ -41,6 +45,10 @@ static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { + if (preemptible()) { + handler_errors++; + pr_err("post-handler is preemptible\n"); + } if (preh_val != (rand1 / div_factor)) { handler_errors++; pr_err("incorrect value in post_handler\n"); @@ -154,8 +162,15 @@ static int test_kprobes(void) } +#if 0 +static u32 jph_val; + static u32 j_kprobe_target(u32 value) { + if (preemptible()) { + handler_errors++; + pr_err("jprobe-handler is preemptible\n"); + } if (value != rand1) { handler_errors++; pr_err("incorrect value in jprobe handler\n"); @@ -227,11 +242,19 @@ static int test_jprobes(void) return 0; } +#else +#define test_jprobe() (0) +#define test_jprobes() (0) +#endif #ifdef CONFIG_KRETPROBES static u32 krph_val; static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { + if (preemptible()) { + handler_errors++; + pr_err("kretprobe entry handler is preemptible\n"); + } krph_val = (rand1 / div_factor); return 0; } @@ -240,6 +263,10 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long ret = regs_return_value(regs); + if (preemptible()) { + handler_errors++; + pr_err("kretprobe return handler is preemptible\n"); + } if (ret != (rand1 / div_factor)) { handler_errors++; pr_err("incorrect value in kretprobe handler\n"); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index ac09bc29eb08..d689a9557e17 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -56,7 +56,7 @@ menu "Timers subsystem" # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices -# are supported independ of this. +# are supported independent of this. config TICK_ONESHOT bool diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 4237e0744e26..16c027e9cc73 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -280,17 +280,22 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; - int64_t delta; + int64_t delta = 0; + int i; - delta = dev->min_delta_ns; - dev->next_event = ktime_add_ns(ktime_get(), delta); + for (i = 0; i < 10; i++) { + delta += dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); - if (clockevent_state_shutdown(dev)) - return 0; + if (clockevent_state_shutdown(dev)) + return 0; - dev->retries++; - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - return dev->set_next_event((unsigned long) clc, dev); + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + } + return -ETIME; } #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88f75f92ef36..d32520840fde 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -758,9 +758,7 @@ void clock_was_set(void) */ void hrtimers_resume(void) { - WARN_ONCE(!irqs_disabled(), - KERN_INFO "hrtimers_resume() called with IRQs enabled!"); - + lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); /* And schedule a retrigger for all others */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 99e03bec68e4..8d70da1b9a0d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -493,6 +493,67 @@ out: return leap; } +static void sync_hw_clock(struct work_struct *work); +static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock); + +static void sched_sync_hw_clock(struct timespec64 now, + unsigned long target_nsec, bool fail) + +{ + struct timespec64 next; + + getnstimeofday64(&next); + if (!fail) + next.tv_sec = 659; + else { + /* + * Try again as soon as possible. Delaying long periods + * decreases the accuracy of the work queue timer. Due to this + * the algorithm is very likely to require a short-sleep retry + * after the above long sleep to synchronize ts_nsec. + */ + next.tv_sec = 0; + } + + /* Compute the needed delay that will get to tv_nsec == target_nsec */ + next.tv_nsec = target_nsec - next.tv_nsec; + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + + queue_delayed_work(system_power_efficient_wq, &sync_work, + timespec64_to_jiffies(&next)); +} + +static void sync_rtc_clock(void) +{ + unsigned long target_nsec; + struct timespec64 adjust, now; + int rc; + + if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) + return; + + getnstimeofday64(&now); + + adjust = now; + if (persistent_clock_is_local) + adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); + + /* + * The current RTC in use will provide the target_nsec it wants to be + * called at, and does rtc_tv_nsec_ok internally. + */ + rc = rtc_set_ntp_time(adjust, &target_nsec); + if (rc == -ENODEV) + return; + + sched_sync_hw_clock(now, target_nsec, rc); +} + #ifdef CONFIG_GENERIC_CMOS_UPDATE int __weak update_persistent_clock(struct timespec now) { @@ -508,76 +569,75 @@ int __weak update_persistent_clock64(struct timespec64 now64) } #endif -#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) -static void sync_cmos_clock(struct work_struct *work); - -static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); - -static void sync_cmos_clock(struct work_struct *work) +static bool sync_cmos_clock(void) { + static bool no_cmos; struct timespec64 now; - struct timespec64 next; - int fail = 1; + struct timespec64 adjust; + int rc = -EPROTO; + long target_nsec = NSEC_PER_SEC / 2; + + if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE)) + return false; + + if (no_cmos) + return false; /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - * We want the clock to be within a couple of ticks from the target. + * Historically update_persistent_clock64() has followed x86 + * semantics, which match the MC146818A/etc RTC. This RTC will store + * 'adjust' and then in .5s it will advance once second. + * + * Architectures are strongly encouraged to use rtclib and not + * implement this legacy API. */ - if (!ntp_synced()) { - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - } - getnstimeofday64(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec64 adjust = now; - - fail = -ENODEV; + if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) { if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); -#ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock64(adjust); -#endif - -#ifdef CONFIG_RTC_SYSTOHC - if (fail == -ENODEV) - fail = rtc_set_ntp_time(adjust); -#endif + rc = update_persistent_clock64(adjust); + /* + * The machine does not support update_persistent_clock64 even + * though it defines CONFIG_GENERIC_CMOS_UPDATE. + */ + if (rc == -ENODEV) { + no_cmos = true; + return false; + } } - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); - if (next.tv_nsec <= 0) - next.tv_nsec += NSEC_PER_SEC; + sched_sync_hw_clock(now, target_nsec, rc); + return true; +} - if (!fail || fail == -ENODEV) - next.tv_sec = 659; - else - next.tv_sec = 0; +/* + * If we have an externally synchronized Linux clock, then update RTC clock + * accordingly every ~11 minutes. Generally RTCs can only store second + * precision, but many RTCs will adjust the phase of their second tick to + * match the moment of update. This infrastructure arranges to call to the RTC + * set at the correct moment to phase synchronize the RTC second tick over + * with the kernel clock. + */ +static void sync_hw_clock(struct work_struct *work) +{ + if (!ntp_synced()) + return; - if (next.tv_nsec >= NSEC_PER_SEC) { - next.tv_sec++; - next.tv_nsec -= NSEC_PER_SEC; - } - queue_delayed_work(system_power_efficient_wq, - &sync_cmos_work, timespec64_to_jiffies(&next)); + if (sync_cmos_clock()) + return; + + sync_rtc_clock(); } void ntp_notify_cmos_timer(void) { - queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); -} - -#else -void ntp_notify_cmos_timer(void) { } -#endif + if (!ntp_synced()) + return; + if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) || + IS_ENABLED(CONFIG_RTC_SYSTOHC)) + queue_delayed_work(system_power_efficient_wq, &sync_work, 0); +} /* * Propagate a new txc->status value into the NTP state: @@ -654,67 +714,6 @@ static inline void process_adjtimex_modes(struct timex *txc, } - -/** - * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex - */ -int ntp_validate_timex(struct timex *txc) -{ - if (txc->modes & ADJ_ADJTIME) { - /* singleshot must not be used with any other mode bits */ - if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) - return -EINVAL; - if (!(txc->modes & ADJ_OFFSET_READONLY) && - !capable(CAP_SYS_TIME)) - return -EPERM; - } else { - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - /* - * if the quartz is off by more than 10% then - * something is VERY wrong! - */ - if (txc->modes & ADJ_TICK && - (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ)) - return -EINVAL; - } - - if (txc->modes & ADJ_SETOFFSET) { - /* In order to inject time, you gotta be super-user! */ - if (!capable(CAP_SYS_TIME)) - return -EPERM; - - if (txc->modes & ADJ_NANO) { - struct timespec ts; - - ts.tv_sec = txc->time.tv_sec; - ts.tv_nsec = txc->time.tv_usec; - if (!timespec_inject_offset_valid(&ts)) - return -EINVAL; - - } else { - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; - } - } - - /* - * Check for potential multiplication overflows that can - * only happen on 64-bit systems: - */ - if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { - if (LLONG_MIN / PPM_SCALE > txc->freq) - return -EINVAL; - if (LLONG_MAX / PPM_SCALE < txc->freq) - return -EINVAL; - } - - return 0; -} - - /* * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 0a53e6ea47b1..909bd1f1bfb1 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,7 +8,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec64 *, const struct timespec64 *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5b117110b55b..1f27887aa194 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -603,7 +603,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, /* * Disarm any old timer after extracting its expiry time. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); ret = 0; old_incr = timer->it.cpu.incr; @@ -1034,7 +1034,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); arm_timer(timer); unlock: unlock_task_sighand(p, &flags); @@ -1125,7 +1125,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) struct k_itimer *timer, *next; unsigned long flags; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); /* * The fast path checks that there are no expired thread or thread diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 06f34feb635e..b258bee13b02 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -117,8 +117,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { - struct timespec64 t64; - struct timespec t; + struct timespec64 t; switch (which_clock) { case CLOCK_REALTIME: @@ -129,16 +128,15 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; } - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + if (get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -203,8 +201,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) { - struct timespec64 t64; - struct timespec t; + struct timespec64 t; switch (which_clock) { case CLOCK_REALTIME: @@ -215,16 +212,15 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EINVAL; } - if (compat_get_timespec(&t, rqtp)) + if (compat_get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 6b009c207671..c1f518e7aa80 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -33,6 +33,7 @@ int tick_program_event(ktime_t expires, int force) * We don't need the clock event device any more, stop it. */ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + dev->next_event = KTIME_MAX; return 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c7a899c5ce64..99578f06c8d4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -27,6 +27,7 @@ #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> +#include <linux/mm.h> #include <asm/irq_regs.h> @@ -165,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; -cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; static atomic_t tick_dep_mask; @@ -198,7 +198,7 @@ static bool check_tick_dependency(atomic_t *dep) static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; @@ -385,20 +385,13 @@ out: local_irq_restore(flags); } -/* Parse the boot-time nohz CPU list from the kernel parameters. */ -static int __init tick_nohz_full_setup(char *str) +/* Get the boot-time nohz CPU list from the kernel parameters. */ +void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); - if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); - free_bootmem_cpumask_var(tick_nohz_full_mask); - return 1; - } + cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; - - return 1; } -__setup("nohz_full=", tick_nohz_full_setup); static int tick_nohz_cpu_down(unsigned int cpu) { @@ -437,13 +430,6 @@ void __init tick_nohz_init(void) return; } - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); - cpumask_clear(tick_nohz_full_mask); - tick_nohz_full_running = false; - return; - } - /* * Full dynticks uses irq work to drive the tick rescheduling on safe * locking contexts. But then we need irq work to raise its own @@ -452,7 +438,6 @@ void __init tick_nohz_init(void) if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); - cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; return; } @@ -465,9 +450,6 @@ void __init tick_nohz_init(void) cpumask_clear_cpu(cpu, tick_nohz_full_mask); } - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); - for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); @@ -477,12 +459,6 @@ void __init tick_nohz_init(void) WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); - - /* - * We need at least one CPU to handle housekeeping work such - * as timekeeping, unbound timers, workqueues, ... - */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } #endif @@ -787,6 +763,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (!ts->tick_stopped) { calc_load_nohz_start(); cpu_load_update_nohz_start(); + quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -960,8 +937,7 @@ void tick_nohz_idle_enter(void) { struct tick_sched *ts; - WARN_ON_ONCE(irqs_disabled()); - + lockdep_assert_irqs_enabled(); /* * Update the idle state in the scheduler domain hierarchy * when tick_nohz_stop_sched_tick() is called from the idle loop. diff --git a/kernel/time/time.c b/kernel/time/time.c index 44a8c1402133..bd4e6c7dd689 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -82,7 +82,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc) SYSCALL_DEFINE1(stime, time_t __user *, tptr) { - struct timespec tv; + struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) @@ -90,11 +90,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) tv.tv_nsec = 0; - err = security_settime(&tv, NULL); + err = security_settime64(&tv, NULL); if (err) return err; - do_settimeofday(&tv); + do_settimeofday64(&tv); return 0; } @@ -122,7 +122,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) { - struct timespec tv; + struct timespec64 tv; int err; if (get_user(tv.tv_sec, tptr)) @@ -130,11 +130,11 @@ COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) tv.tv_nsec = 0; - err = security_settime(&tv, NULL); + err = security_settime64(&tv, NULL); if (err) return err; - do_settimeofday(&tv); + do_settimeofday64(&tv); return 0; } @@ -158,40 +158,6 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, } /* - * Indicates if there is an offset between the system clock and the hardware - * clock/persistent clock/rtc. - */ -int persistent_clock_is_local; - -/* - * Adjust the time obtained from the CMOS to be UTC time instead of - * local time. - * - * This is ugly, but preferable to the alternatives. Otherwise we - * would either need to write a program to do it in /etc/rc (and risk - * confusion if the program gets run more than once; it would also be - * hard to make the program warp the clock precisely n hours) or - * compile in the timezone information into the kernel. Bad, bad.... - * - * - TYT, 1992-01-01 - * - * The best thing to do is to keep the CMOS clock in universal time (UTC) - * as real UNIX machines always do it. This avoids all headaches about - * daylight saving times and warping kernel clocks. - */ -static inline void warp_clock(void) -{ - if (sys_tz.tz_minuteswest != 0) { - struct timespec adjust; - - persistent_clock_is_local = 1; - adjust.tv_sec = sys_tz.tz_minuteswest * 60; - adjust.tv_nsec = 0; - timekeeping_inject_offset(&adjust); - } -} - -/* * In case for some reason the CMOS clock has not already been running * in UTC, but in some local time: The first time we set the timezone, * we will warp the clock so that it is ticking UTC time instead of @@ -224,7 +190,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz if (firsttime) { firsttime = 0; if (!tv) - warp_clock(); + timekeeping_warp_clock(); } } if (tv) @@ -441,6 +407,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); +#if __BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -501,6 +468,7 @@ struct timespec ns_to_timespec(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec); +#endif /** * ns_to_timeval - Convert nanoseconds to timeval @@ -520,7 +488,6 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); -#if BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -581,7 +548,7 @@ struct timespec64 ns_to_timespec64(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec64); -#endif + /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds @@ -853,24 +820,6 @@ unsigned long nsecs_to_jiffies(u64 n) EXPORT_SYMBOL_GPL(nsecs_to_jiffies); /* - * Add two timespec values and do a safety check for overflow. - * It's assumed that both values are valid (>= 0) - */ -struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs) -{ - struct timespec res; - - set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - - if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) - res.tv_sec = TIME_T_MAX; - - return res; -} - -/* * Add two timespec64 values and do a safety check for overflow. * It's assumed that both values are valid (>= 0). * And, each timespec64 is in normalized form. diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2cafb49aa65e..198afa78bf69 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -60,8 +60,27 @@ struct tk_fast { struct tk_read_base base[2]; }; -static struct tk_fast tk_fast_mono ____cacheline_aligned; -static struct tk_fast tk_fast_raw ____cacheline_aligned; +/* Suspend-time cycles value for halted fast timekeeper. */ +static u64 cycles_at_suspend; + +static u64 dummy_clock_read(struct clocksource *cs) +{ + return cycles_at_suspend; +} + +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .base[0] = { .clock = &dummy_clock, }, + .base[1] = { .clock = &dummy_clock, }, +}; + +static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .base[0] = { .clock = &dummy_clock, }, + .base[1] = { .clock = &dummy_clock, }, +}; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -477,17 +496,39 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); -/* Suspend-time cycles value for halted fast timekeeper. */ -static u64 cycles_at_suspend; -static u64 dummy_clock_read(struct clocksource *cs) +/* + * See comment for __ktime_get_fast_ns() vs. timestamp ordering + */ +static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) { - return cycles_at_suspend; + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base_real); + + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tk_clock_read(tkr), + tkr->cycle_last, + tkr->mask)); + } while (read_seqcount_retry(&tkf->seq, seq)); + + return now; } -static struct clocksource dummy_clock = { - .read = dummy_clock_read, -}; +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + */ +u64 ktime_get_real_fast_ns(void) +{ + return __ktime_get_real_fast_ns(&tk_fast_mono); +} +EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. @@ -507,6 +548,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk) memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; + tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; @@ -654,6 +696,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); @@ -1264,33 +1307,31 @@ EXPORT_SYMBOL(do_settimeofday64); * * Adds or subtracts an offset value from the current time. */ -int timekeeping_inject_offset(struct timespec *ts) +static int timekeeping_inject_offset(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec64 ts64, tmp; + struct timespec64 tmp; int ret = 0; - if (!timespec_inject_offset_valid(ts)) + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - ts64 = timespec_to_timespec64(*ts); - raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tk), ts64); - if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 || + tmp = timespec64_add(tk_xtime(tk), *ts); + if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || !timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } - tk_xtime_add(tk, &ts64); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64)); + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1303,7 +1344,40 @@ error: /* even if we error out, we forwarded the time, so call update */ return ret; } -EXPORT_SYMBOL(timekeeping_inject_offset); + +/* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +void timekeeping_warp_clock(void) +{ + if (sys_tz.tz_minuteswest != 0) { + struct timespec64 adjust; + + persistent_clock_is_local = 1; + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } +} /** * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic @@ -2248,6 +2322,72 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, } /** + * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex + */ +static int timekeeping_validate_timex(struct timex *txc) +{ + if (txc->modes & ADJ_ADJTIME) { + /* singleshot must not be used with any other mode bits */ + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) + return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + } + + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + /* + * Validate if a timespec/timeval used to inject a time + * offset is valid. Offsets can be postive or negative, so + * we don't check tv_sec. The value of the timeval/timespec + * is the sum of its fields,but *NOTE*: + * The field tv_usec/tv_nsec must always be non-negative and + * we can't have more nanoseconds/microseconds than a second. + */ + if (txc->time.tv_usec < 0) + return -EINVAL; + + if (txc->modes & ADJ_NANO) { + if (txc->time.tv_usec >= NSEC_PER_SEC) + return -EINVAL; + } else { + if (txc->time.tv_usec >= USEC_PER_SEC) + return -EINVAL; + } + } + + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LLONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + + return 0; +} + + +/** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ int do_adjtimex(struct timex *txc) @@ -2259,12 +2399,12 @@ int do_adjtimex(struct timex *txc) int ret; /* Validate the data before disabling interrupts */ - ret = ntp_validate_timex(txc); + ret = timekeeping_validate_timex(txc); if (ret) return ret; if (txc->modes & ADJ_SETOFFSET) { - struct timespec delta; + struct timespec64 delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index c9f9af339914..7a9b4eb7a1d5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -11,7 +11,7 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); -extern int timekeeping_inject_offset(struct timespec *ts); +extern void timekeeping_warp_clock(void); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f2674a056c26..af0b8bae4502 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -610,7 +610,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state) } /* Stub timer callback for improperly used timers. */ -static void stub_timer(unsigned long data) +static void stub_timer(struct timer_list *unused) { WARN_ON(1); } @@ -626,7 +626,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - setup_timer(timer, stub_timer, 0); + timer_setup(timer, stub_timer, 0); return true; case ODEBUG_STATE_ACTIVE: @@ -665,7 +665,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - setup_timer(timer, stub_timer, 0); + timer_setup(timer, stub_timer, 0); return true; default: return false; @@ -929,8 +929,11 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, } } +#define MOD_TIMER_PENDING_ONLY 0x01 +#define MOD_TIMER_REDUCE 0x02 + static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) +__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; @@ -950,7 +953,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * larger granularity than you would get from adding a new * timer with this expiry. */ - if (timer->expires == expires) + long diff = timer->expires - expires; + + if (!diff) + return 1; + if (options & MOD_TIMER_REDUCE && diff <= 0) return 1; /* @@ -962,6 +969,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) base = lock_timer_base(timer, &flags); forward_timer_base(base); + if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && + time_before_eq(timer->expires, expires)) { + ret = 1; + goto out_unlock; + } + clk = base->clk; idx = calc_wheel_index(expires, clk); @@ -971,7 +984,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * subsequent call will exit in the expires check above. */ if (idx == timer_get_idx(timer)) { - timer->expires = expires; + if (!(options & MOD_TIMER_REDUCE)) + timer->expires = expires; + else if (time_after(timer->expires, expires)) + timer->expires = expires; ret = 1; goto out_unlock; } @@ -981,7 +997,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } ret = detach_if_pending(timer, base, false); - if (!ret && pending_only) + if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; debug_activate(timer, expires); @@ -1042,7 +1058,7 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true); + return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); } EXPORT_SYMBOL(mod_timer_pending); @@ -1068,11 +1084,26 @@ EXPORT_SYMBOL(mod_timer_pending); */ int mod_timer(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, false); + return __mod_timer(timer, expires, 0); } EXPORT_SYMBOL(mod_timer); /** + * timer_reduce - Modify a timer's timeout if it would reduce the timeout + * @timer: The timer to be modified + * @expires: New timeout in jiffies + * + * timer_reduce() is very similar to mod_timer(), except that it will only + * modify a running timer if that would reduce the expiration time (it will + * start a timer that isn't running). + */ +int timer_reduce(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, MOD_TIMER_REDUCE); +} +EXPORT_SYMBOL(timer_reduce); + +/** * add_timer - start a timer * @timer: the timer to be added * @@ -1560,8 +1591,11 @@ static int collect_expired_timers(struct timer_base *base, * jiffies, otherwise forward to the next expiry time: */ if (time_after(next, jiffies)) { - /* The call site will increment clock! */ - base->clk = jiffies - 1; + /* + * The call site will increment base->clk and then + * terminate the expiry loop immediately. + */ + base->clk = jiffies; return 0; } base->clk = next; @@ -1668,9 +1702,20 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -static void process_timeout(unsigned long __data) +/* + * Since schedule_timeout()'s timer is defined on the stack, it must store + * the target task on the stack as well. + */ +struct process_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void process_timeout(struct timer_list *t) { - wake_up_process((struct task_struct *)__data); + struct process_timer *timeout = from_timer(timeout, t, timer); + + wake_up_process(timeout->task); } /** @@ -1704,7 +1749,7 @@ static void process_timeout(unsigned long __data) */ signed long __sched schedule_timeout(signed long timeout) { - struct timer_list timer; + struct process_timer timer; unsigned long expire; switch (timeout) @@ -1738,13 +1783,14 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false); + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, 0); schedule(); - del_singleshot_timer_sync(&timer); + del_singleshot_timer_sync(&timer.timer); /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer); + destroy_timer_on_stack(&timer.timer); timeout = expire - jiffies; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index dc498b605d5d..95888ae6c263 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + err = perf_event_read_local(ee->event, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 81279c6602ff..845f3805c73d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2724,7 +2724,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, * if it happened, we have to fail the write. */ barrier(); - if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) { local_dec(&cpu_buffer->committing); local_dec(&cpu_buffer->commits); return NULL; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 401b0639116f..6b0b343a36a2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1460,7 +1460,7 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr, static inline void *event_file_data(struct file *filp) { - return ACCESS_ONCE(file_inode(filp)->i_private); + return READ_ONCE(file_inode(filp)->i_private); } extern struct mutex event_mutex; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c738e764e2a5..90db994ac900 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = __task_state_to_char(field->next_state); - S = __task_state_to_char(field->prev_state); + T = task_index_to_char(field->next_state); + S = task_index_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = __task_state_to_char(field->prev_state); - T = __task_state_to_char(field->next_state); + S = task_index_to_char(field->prev_state); + T = task_index_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = __task_state_to_char(field->prev_state); - T = __task_state_to_char(field->next_state); + S = task_index_to_char(field->prev_state); + T = task_index_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7d461dcd4831..a86b303e6c67 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -398,10 +398,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = __get_task_state(prev); + entry->prev_state = task_state_index(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = __get_task_state(next); + entry->next_state = task_state_index(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -426,10 +426,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = __get_task_state(curr); + entry->prev_state = task_state_index(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = __get_task_state(wakee); + entry->next_state = task_state_index(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 719a52a4064a..734accc02418 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -78,7 +78,7 @@ check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; - int frame_size = ACCESS_ONCE(tracer_frame); + int frame_size = READ_ONCE(tracer_frame); int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..d32b45662fb6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -894,7 +894,7 @@ static bool new_idmap_permitted(const struct file *file, int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; - unsigned long userns_flags = ACCESS_ONCE(ns->flags); + unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c8e06703e44c..576d18045811 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -25,6 +25,7 @@ #include <linux/workqueue.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> +#include <linux/sched/isolation.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -774,15 +775,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, void __init lockup_detector_init(void) { -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) { + if (tick_nohz_full_enabled()) pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_copy(&watchdog_cpumask, housekeeping_mask); - } else - cpumask_copy(&watchdog_cpumask, cpu_possible_mask); -#else - cpumask_copy(&watchdog_cpumask, cpu_possible_mask); -#endif + + cpumask_copy(&watchdog_cpumask, + housekeeping_cpumask(HK_FLAG_TIMER)); if (!watchdog_nmi_probe()) nmi_watchdog_available = true; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a2dccfe1acec..7368b57842ea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1376,7 +1376,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); debug_work_activate(work); @@ -1493,9 +1493,9 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL(queue_work_on); -void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(struct timer_list *t) { - struct delayed_work *dwork = (struct delayed_work *)__data; + struct delayed_work *dwork = from_timer(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); @@ -1509,8 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_ONCE(timer->function != delayed_work_timer_fn || - timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); @@ -1833,9 +1832,9 @@ static void destroy_worker(struct worker *worker) wake_up_process(worker->task); } -static void idle_worker_timeout(unsigned long __pool) +static void idle_worker_timeout(struct timer_list *t) { - struct worker_pool *pool = (void *)__pool; + struct worker_pool *pool = from_timer(pool, t, idle_timer); spin_lock_irq(&pool->lock); @@ -1881,9 +1880,9 @@ static void send_mayday(struct work_struct *work) } } -static void pool_mayday_timeout(unsigned long __pool) +static void pool_mayday_timeout(struct timer_list *t) { - struct worker_pool *pool = (void *)__pool; + struct worker_pool *pool = from_timer(pool, t, mayday_timer); struct work_struct *work; spin_lock_irq(&pool->lock); @@ -2491,15 +2490,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); - /* - * Explicitly init the crosslock for wq_barrier::done, make its lock - * key a subkey of the corresponding work. As a result we won't - * build a dependency between wq_barrier::done and unrelated work. - */ - lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map, - "(complete)wq_barr::done", - target->lockdep_map.key, 1); - __init_completion(&barr->done); + init_completion_map(&barr->done, &target->lockdep_map); + barr->task = current; /* @@ -2605,16 +2597,13 @@ void flush_workqueue(struct workqueue_struct *wq) struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, - .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), }; int next_color; if (WARN_ON(!wq_online)) return; - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); - mutex_lock(&wq->mutex); /* @@ -2877,9 +2866,6 @@ bool flush_work(struct work_struct *work) if (WARN_ON(!wq_online)) return false; - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); @@ -3236,11 +3222,9 @@ static int init_worker_pool(struct worker_pool *pool) INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); - setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout, - (unsigned long)pool); + timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); + timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); @@ -4640,7 +4624,7 @@ static void rebind_workers(struct worker_pool *pool) * concurrency management. Note that when or whether * @worker clears REBOUND doesn't affect correctness. * - * ACCESS_ONCE() is necessary because @worker->flags may be + * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in * wq_worker_waking_up(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency @@ -4649,7 +4633,7 @@ static void rebind_workers(struct worker_pool *pool) WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); worker_flags |= WORKER_REBOUND; worker_flags &= ~WORKER_UNBOUND; - ACCESS_ONCE(worker->flags) = worker_flags; + WRITE_ONCE(worker->flags, worker_flags); } spin_unlock_irq(&pool->lock); @@ -5383,11 +5367,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } */ #ifdef CONFIG_WQ_WATCHDOG -static void wq_watchdog_timer_fn(unsigned long data); - static unsigned long wq_watchdog_thresh = 30; -static struct timer_list wq_watchdog_timer = - TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); +static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; @@ -5401,7 +5382,7 @@ static void wq_watchdog_reset_touched(void) per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; } -static void wq_watchdog_timer_fn(unsigned long data) +static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; @@ -5503,6 +5484,7 @@ module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, static void wq_watchdog_init(void) { + timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE); wq_watchdog_set_thresh(wq_watchdog_thresh); } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index efdd72e15794..d390d1be3748 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -10,6 +10,7 @@ #include <linux/workqueue.h> #include <linux/kthread.h> +#include <linux/preempt.h> struct worker_pool; @@ -60,7 +61,7 @@ struct worker { */ static inline struct worker *current_wq_worker(void) { - if (current->flags & PF_WQ_WORKER) + if (in_task() && (current->flags & PF_WQ_WORKER)) return kthread_data(current); return NULL; } |