summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/bpf/arraymap.c2
-rw-r--r--kernel/cgroup/cpuset.c15
-rw-r--r--kernel/events/core.c492
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/extable.c2
-rw-r--r--kernel/irq/Kconfig6
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/autoprobe.c2
-rw-r--r--kernel/irq/chip.c35
-rw-r--r--kernel/irq/debugfs.c12
-rw-r--r--kernel/irq/internals.h19
-rw-r--r--kernel/irq/irqdesc.c9
-rw-r--r--kernel/irq/irqdomain.c60
-rw-r--r--kernel/irq/manage.c23
-rw-r--r--kernel/irq/matrix.c443
-rw-r--r--kernel/irq/msi.c32
-rw-r--r--kernel/irq/proc.c5
-rw-r--r--kernel/irq/spurious.c2
-rw-r--r--kernel/irq/timings.c2
-rw-r--r--kernel/irq_work.c21
-rw-r--r--kernel/jump_label.c14
-rw-r--r--kernel/kallsyms.c43
-rw-r--r--kernel/kexec_file.c5
-rw-r--r--kernel/kprobes.c18
-rw-r--r--kernel/kthread.c10
-rw-r--r--kernel/locking/lockdep.c23
-rw-r--r--kernel/locking/qrwlock.c86
-rw-r--r--kernel/locking/qspinlock_paravirt.h47
-rw-r--r--kernel/locking/rwsem.c16
-rw-r--r--kernel/locking/spinlock.c9
-rw-r--r--kernel/module.c32
-rw-r--r--kernel/rcu/rcu.h21
-rw-r--r--kernel/rcu/rcu_segcblist.c1
-rw-r--r--kernel/rcu/rcutorture.c28
-rw-r--r--kernel/rcu/tree.c175
-rw-r--r--kernel/rcu/tree.h5
-rw-r--r--kernel/rcu/tree_plugin.h36
-rw-r--r--kernel/rcu/update.c28
-rw-r--r--kernel/resource.c76
-rw-r--r--kernel/sched/Makefile1
-rw-r--r--kernel/sched/clock.c2
-rw-r--r--kernel/sched/core.c61
-rw-r--r--kernel/sched/cputime.c3
-rw-r--r--kernel/sched/deadline.c21
-rw-r--r--kernel/sched/debug.c18
-rw-r--r--kernel/sched/fair.c1049
-rw-r--r--kernel/sched/idle.c4
-rw-r--r--kernel/sched/isolation.c155
-rw-r--r--kernel/sched/rt.c316
-rw-r--r--kernel/sched/sched.h73
-rw-r--r--kernel/sched/topology.c49
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c10
-rw-r--r--kernel/task_work.c2
-rw-r--r--kernel/test_kprobes.c29
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/clockevents.c21
-rw-r--r--kernel/time/hrtimer.c4
-rw-r--r--kernel/time/ntp.c227
-rw-r--r--kernel/time/ntp_internal.h1
-rw-r--r--kernel/time/posix-cpu-timers.c6
-rw-r--r--kernel/time/posix-stubs.c20
-rw-r--r--kernel/time/tick-oneshot.c1
-rw-r--r--kernel/time/tick-sched.c38
-rw-r--r--kernel/time/time.c71
-rw-r--r--kernel/time/timekeeping.c182
-rw-r--r--kernel/time/timekeeping.h2
-rw-r--r--kernel/time/timer.c82
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_output.c12
-rw-r--r--kernel/trace/trace_sched_wakeup.c8
-rw-r--r--kernel/trace/trace_stack.c2
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/watchdog.c13
-rw-r--r--kernel/workqueue.c54
-rw-r--r--kernel/workqueue_internal.h3
81 files changed, 2838 insertions, 1580 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 6670fbd3e466..d15c0ee4d955 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -147,7 +147,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
again:
smp_rmb();
rcu_read_lock();
- res = to_acct(ACCESS_ONCE(ns->bacct));
+ res = to_acct(READ_ONCE(ns->bacct));
if (!res) {
rcu_read_unlock();
return NULL;
@@ -159,7 +159,7 @@ again:
}
rcu_read_unlock();
mutex_lock(&res->lock);
- if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
+ if (res != to_acct(READ_ONCE(ns->bacct))) {
mutex_unlock(&res->lock);
acct_put(res);
goto again;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index e2636737b69b..c4b9ab01bba5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
- if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+ if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
goto err_out;
ee = bpf_event_entry_gen(perf_file, map_file);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e2924ecb..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -57,7 +57,7 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
-
+#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
@@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
- cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
- if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
- goto done;
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- non_isolated_cpus);
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
goto done;
}
@@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
if (is_sched_load_balance(cp))
@@ -789,7 +785,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, non_isolated_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -802,7 +798,6 @@ restart:
BUG_ON(nslot != ndoms);
done:
- free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 10cdb9c26b5d..4c39c05e029a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -209,7 +209,7 @@ static int event_function(void *info)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
perf_ctx_lock(cpuctx, task_ctx);
/*
@@ -306,7 +306,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
if (task) {
if (task == TASK_TOMBSTONE)
@@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event)
return event->clock();
}
+/*
+ * State based event timekeeping...
+ *
+ * The basic idea is to use event->state to determine which (if any) time
+ * fields to increment with the current delta. This means we only need to
+ * update timestamps when we change state or when they are explicitly requested
+ * (read).
+ *
+ * Event groups make things a little more complicated, but not terribly so. The
+ * rules for a group are that if the group leader is OFF the entire group is
+ * OFF, irrespecive of what the group member states are. This results in
+ * __perf_effective_state().
+ *
+ * A futher ramification is that when a group leader flips between OFF and
+ * !OFF, we need to update all group member times.
+ *
+ *
+ * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
+ * need to make sure the relevant context time is updated before we try and
+ * update our timestamps.
+ */
+
+static __always_inline enum perf_event_state
+__perf_effective_state(struct perf_event *event)
+{
+ struct perf_event *leader = event->group_leader;
+
+ if (leader->state <= PERF_EVENT_STATE_OFF)
+ return leader->state;
+
+ return event->state;
+}
+
+static __always_inline void
+__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
+{
+ enum perf_event_state state = __perf_effective_state(event);
+ u64 delta = now - event->tstamp;
+
+ *enabled = event->total_time_enabled;
+ if (state >= PERF_EVENT_STATE_INACTIVE)
+ *enabled += delta;
+
+ *running = event->total_time_running;
+ if (state >= PERF_EVENT_STATE_ACTIVE)
+ *running += delta;
+}
+
+static void perf_event_update_time(struct perf_event *event)
+{
+ u64 now = perf_event_time(event);
+
+ __perf_update_times(event, now, &event->total_time_enabled,
+ &event->total_time_running);
+ event->tstamp = now;
+}
+
+static void perf_event_update_sibling_time(struct perf_event *leader)
+{
+ struct perf_event *sibling;
+
+ list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+ perf_event_update_time(sibling);
+}
+
+static void
+perf_event_set_state(struct perf_event *event, enum perf_event_state state)
+{
+ if (event->state == state)
+ return;
+
+ perf_event_update_time(event);
+ /*
+ * If a group leader gets enabled/disabled all its siblings
+ * are affected too.
+ */
+ if ((event->state < 0) ^ (state < 0))
+ perf_event_update_sibling_time(event);
+
+ WRITE_ONCE(event->state, state);
+}
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
event->shadow_ctx_time = now - t->timestamp;
}
-static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
- /*
- * when the current task's perf cgroup does not match
- * the event's, we need to remember to call the
- * perf_mark_enable() function the first time a task with
- * a matching perf cgroup is scheduled in.
- */
- if (is_cgroup_event(event) && !perf_cgroup_match(event))
- event->cgrp_defer_enabled = 1;
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- struct perf_event *sub;
- u64 tstamp = perf_event_time(event);
-
- if (!event->cgrp_defer_enabled)
- return;
-
- event->cgrp_defer_enabled = 0;
-
- event->tstamp_enabled = tstamp - event->total_time_enabled;
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
- if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
- sub->cgrp_defer_enabled = 0;
- }
- }
-}
-
/*
* Update cpuctx->cgrp so that it is set when first cgroup event is added and
* cleared when last cgroup event is removed.
@@ -975,17 +1023,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
}
static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
-{
-}
-
-static inline void
list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
{
@@ -1006,7 +1043,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
struct perf_cpu_context *cpuctx;
int rotations = 0;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
rotations = perf_rotate_context(cpuctx);
@@ -1093,7 +1130,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
struct list_head *head = this_cpu_ptr(&active_ctx_list);
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
WARN_ON(!list_empty(&ctx->active_ctx_list));
@@ -1102,7 +1139,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
WARN_ON(list_empty(&ctx->active_ctx_list));
@@ -1202,7 +1239,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
again:
rcu_read_lock();
- ctx = ACCESS_ONCE(event->ctx);
+ ctx = READ_ONCE(event->ctx);
if (!atomic_inc_not_zero(&ctx->refcount)) {
rcu_read_unlock();
goto again;
@@ -1398,60 +1435,6 @@ static u64 perf_event_time(struct perf_event *event)
return ctx ? ctx->time : 0;
}
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
- struct perf_event_context *ctx = event->ctx;
- u64 run_end;
-
- lockdep_assert_held(&ctx->lock);
-
- if (event->state < PERF_EVENT_STATE_INACTIVE ||
- event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
- return;
-
- /*
- * in cgroup mode, time_enabled represents
- * the time the event was enabled AND active
- * tasks were in the monitored cgroup. This is
- * independent of the activity of the context as
- * there may be a mix of cgroup and non-cgroup events.
- *
- * That is why we treat cgroup events differently
- * here.
- */
- if (is_cgroup_event(event))
- run_end = perf_cgroup_event_time(event);
- else if (ctx->is_active)
- run_end = ctx->time;
- else
- run_end = event->tstamp_stopped;
-
- event->total_time_enabled = run_end - event->tstamp_enabled;
-
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- run_end = event->tstamp_stopped;
- else
- run_end = perf_event_time(event);
-
- event->total_time_running = run_end - event->tstamp_running;
-
-}
-
-/*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
- struct perf_event *event;
-
- update_event_times(leader);
- list_for_each_entry(event, &leader->sibling_list, group_entry)
- update_event_times(event);
-}
-
static enum event_type_t get_event_type(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
@@ -1494,6 +1477,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
event->attach_state |= PERF_ATTACH_CONTEXT;
+ event->tstamp = perf_event_time(event);
+
/*
* If we're a stand alone event or group leader, we go to the context
* list, group events are kept attached to the group so that
@@ -1701,8 +1686,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader == event)
list_del_init(&event->group_entry);
- update_group_times(event);
-
/*
* If event was in error state, then keep it
* that way, otherwise bogus counts will be
@@ -1711,7 +1694,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
* of the event
*/
if (event->state > PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_OFF;
+ perf_event_set_state(event, PERF_EVENT_STATE_OFF);
ctx->generation++;
}
@@ -1810,38 +1793,24 @@ event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
- u64 delta;
+ enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
- /*
- * An event which could not be activated because of
- * filter mismatch still needs to have its timings
- * maintained, otherwise bogus information is return
- * via read() for time_enabled, time_running:
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE &&
- !event_filter_match(event)) {
- delta = tstamp - event->tstamp_stopped;
- event->tstamp_running += delta;
- event->tstamp_stopped = tstamp;
- }
-
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
perf_pmu_disable(event->pmu);
- event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
- event->state = PERF_EVENT_STATE_INACTIVE;
+
if (event->pending_disable) {
event->pending_disable = 0;
- event->state = PERF_EVENT_STATE_OFF;
+ state = PERF_EVENT_STATE_OFF;
}
+ perf_event_set_state(event, state);
if (!is_software_event(event))
cpuctx->active_oncpu--;
@@ -1861,7 +1830,9 @@ group_sched_out(struct perf_event *group_event,
struct perf_event_context *ctx)
{
struct perf_event *event;
- int state = group_event->state;
+
+ if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
perf_pmu_disable(ctx->pmu);
@@ -1875,7 +1846,7 @@ group_sched_out(struct perf_event *group_event,
perf_pmu_enable(ctx->pmu);
- if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+ if (group_event->attr.exclusive)
cpuctx->exclusive = 0;
}
@@ -1895,6 +1866,11 @@ __perf_remove_from_context(struct perf_event *event,
{
unsigned long flags = (unsigned long)info;
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
+ }
+
event_sched_out(event, cpuctx, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
@@ -1957,14 +1933,17 @@ static void __perf_event_disable(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- update_group_times(event);
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
else
event_sched_out(event, cpuctx, ctx);
- event->state = PERF_EVENT_STATE_OFF;
+
+ perf_event_set_state(event, PERF_EVENT_STATE_OFF);
}
/*
@@ -2021,8 +2000,7 @@ void perf_event_disable_inatomic(struct perf_event *event)
}
static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx,
- u64 tstamp)
+ struct perf_event_context *ctx)
{
/*
* use the correct time source for the time snapshot
@@ -2050,9 +2028,9 @@ static void perf_set_shadow_time(struct perf_event *event,
* is cleaner and simpler to understand.
*/
if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, tstamp);
+ perf_cgroup_set_shadow_time(event, event->tstamp);
else
- event->shadow_ctx_time = tstamp - ctx->timestamp;
+ event->shadow_ctx_time = event->tstamp - ctx->timestamp;
}
#define MAX_INTERRUPTS (~0ULL)
@@ -2065,7 +2043,6 @@ event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
int ret = 0;
lockdep_assert_held(&ctx->lock);
@@ -2075,11 +2052,12 @@ event_sched_in(struct perf_event *event,
WRITE_ONCE(event->oncpu, smp_processor_id());
/*
- * Order event::oncpu write to happen before the ACTIVE state
- * is visible.
+ * Order event::oncpu write to happen before the ACTIVE state is
+ * visible. This allows perf_event_{stop,read}() to observe the correct
+ * ->oncpu if it sees ACTIVE.
*/
smp_wmb();
- WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
+ perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
/*
* Unthrottle events, since we scheduled we might have missed several
@@ -2091,26 +2069,19 @@ event_sched_in(struct perf_event *event,
event->hw.interrupts = 0;
}
- /*
- * The new state must be visible before we turn it on in the hardware:
- */
- smp_wmb();
-
perf_pmu_disable(event->pmu);
- perf_set_shadow_time(event, ctx, tstamp);
+ perf_set_shadow_time(event, ctx);
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
- event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
event->oncpu = -1;
ret = -EAGAIN;
goto out;
}
- event->tstamp_running += tstamp - event->tstamp_stopped;
-
if (!is_software_event(event))
cpuctx->active_oncpu++;
if (!ctx->nr_active++)
@@ -2134,8 +2105,6 @@ group_sched_in(struct perf_event *group_event,
{
struct perf_event *event, *partial_group = NULL;
struct pmu *pmu = ctx->pmu;
- u64 now = ctx->time;
- bool simulate = false;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
@@ -2165,27 +2134,13 @@ group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
- * The events up to the failed event are scheduled out normally,
- * tstamp_stopped will be updated.
- *
- * The failed events and the remaining siblings need to have
- * their timings updated as if they had gone thru event_sched_in()
- * and event_sched_out(). This is required to get consistent timings
- * across the group. This also takes care of the case where the group
- * could never be scheduled by ensuring tstamp_stopped is set to mark
- * the time the event was actually stopped, such that time delta
- * calculation in update_event_times() is correct.
+ * The events up to the failed event are scheduled out normally.
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
if (event == partial_group)
- simulate = true;
+ break;
- if (simulate) {
- event->tstamp_running += now - event->tstamp_stopped;
- event->tstamp_stopped = now;
- } else {
- event_sched_out(event, cpuctx, ctx);
- }
+ event_sched_out(event, cpuctx, ctx);
}
event_sched_out(group_event, cpuctx, ctx);
@@ -2227,46 +2182,11 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
-/*
- * Complement to update_event_times(). This computes the tstamp_* values to
- * continue 'enabled' state from @now, and effectively discards the time
- * between the prior tstamp_stopped and now (as we were in the OFF state, or
- * just switched (context) time base).
- *
- * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
- * cannot have been scheduled in yet. And going into INACTIVE state means
- * '@event->tstamp_stopped = @now'.
- *
- * Thus given the rules of update_event_times():
- *
- * total_time_enabled = tstamp_stopped - tstamp_enabled
- * total_time_running = tstamp_stopped - tstamp_running
- *
- * We can insert 'tstamp_stopped == now' and reverse them to compute new
- * tstamp_* values.
- */
-static void __perf_event_enable_time(struct perf_event *event, u64 now)
-{
- WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
-
- event->tstamp_stopped = now;
- event->tstamp_enabled = now - event->total_time_enabled;
- event->tstamp_running = now - event->total_time_running;
-}
-
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
-
list_add_event(event, ctx);
perf_group_attach(event);
- /*
- * We can be called with event->state == STATE_OFF when we create with
- * .disabled = 1. In that case the IOC_ENABLE will call this function.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2498,28 +2418,6 @@ again:
}
/*
- * Put a event into inactive state and update time fields.
- * Enabling the leader of a group effectively enables all
- * the group members that aren't explicitly disabled, so we
- * have to update their ->tstamp_enabled also.
- * Note: this works for group members as well as group leaders
- * since the non-leader members' sibling_lists will be empty.
- */
-static void __perf_event_mark_enabled(struct perf_event *event)
-{
- struct perf_event *sub;
- u64 tstamp = perf_event_time(event);
-
- event->state = PERF_EVENT_STATE_INACTIVE;
- __perf_event_enable_time(event, tstamp);
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
- /* XXX should not be > INACTIVE if event isn't */
- if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- __perf_event_enable_time(sub, tstamp);
- }
-}
-
-/*
* Cross CPU call to enable a performance event
*/
static void __perf_event_enable(struct perf_event *event,
@@ -2537,14 +2435,12 @@ static void __perf_event_enable(struct perf_event *event,
if (ctx->is_active)
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
- __perf_event_mark_enabled(event);
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
if (!ctx->is_active)
return;
if (!event_filter_match(event)) {
- if (is_cgroup_event(event))
- perf_cgroup_defer_enabled(event);
ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
return;
}
@@ -2864,18 +2760,10 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- switch (event->state) {
- case PERF_EVENT_STATE_ACTIVE:
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
event->pmu->read(event);
- /* fall-through */
-
- case PERF_EVENT_STATE_INACTIVE:
- update_event_times(event);
- break;
- default:
- break;
- }
+ perf_event_update_time(event);
/*
* In order to keep per-task stats reliable we need to flip the event
@@ -3112,10 +3000,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
-
if (group_can_go_on(event, cpuctx, 1))
group_sched_in(event, cpuctx, ctx);
@@ -3123,10 +3007,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
* If this pinned group hasn't been scheduled,
* put it in error state.
*/
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_ERROR;
- }
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
}
@@ -3148,10 +3030,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
-
if (group_can_go_on(event, cpuctx, can_add_hw)) {
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
@@ -3523,7 +3401,7 @@ void perf_event_task_tick(void)
struct perf_event_context *ctx, *tmp;
int throttled;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
@@ -3543,7 +3421,7 @@ static int event_enable_on_exec(struct perf_event *event,
if (event->state >= PERF_EVENT_STATE_INACTIVE)
return 0;
- __perf_event_mark_enabled(event);
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
return 1;
}
@@ -3637,12 +3515,15 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active) {
+ if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
- update_event_times(event);
+ perf_event_update_time(event);
+ if (data->group)
+ perf_event_update_sibling_time(event);
+
if (event->state != PERF_EVENT_STATE_ACTIVE)
goto unlock;
@@ -3657,7 +3538,6 @@ static void __perf_event_read(void *info)
pmu->read(event);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
- update_event_times(sub);
if (sub->state == PERF_EVENT_STATE_ACTIVE) {
/*
* Use sibling's PMU rather than @event's since
@@ -3686,7 +3566,8 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+ u64 *enabled, u64 *running)
{
unsigned long flags;
int ret = 0;
@@ -3720,6 +3601,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
goto out;
}
+
/*
* If the event is currently on this CPU, its either a per-task event,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
@@ -3729,6 +3611,16 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
event->pmu->read(event);
*value = local64_read(&event->count);
+ if (enabled || running) {
+ u64 now = event->shadow_ctx_time + perf_clock();
+ u64 __enabled, __running;
+
+ __perf_update_times(event, now, &__enabled, &__running);
+ if (enabled)
+ *enabled = __enabled;
+ if (running)
+ *running = __running;
+ }
out:
local_irq_restore(flags);
@@ -3737,23 +3629,35 @@ out:
static int perf_event_read(struct perf_event *event, bool group)
{
+ enum perf_event_state state = READ_ONCE(event->state);
int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
- struct perf_read_data data = {
- .event = event,
- .group = group,
- .ret = 0,
- };
+again:
+ if (state == PERF_EVENT_STATE_ACTIVE) {
+ struct perf_read_data data;
+
+ /*
+ * Orders the ->state and ->oncpu loads such that if we see
+ * ACTIVE we must also see the right ->oncpu.
+ *
+ * Matches the smp_wmb() from event_sched_in().
+ */
+ smp_rmb();
event_cpu = READ_ONCE(event->oncpu);
if ((unsigned)event_cpu >= nr_cpu_ids)
return 0;
+ data = (struct perf_read_data){
+ .event = event,
+ .group = group,
+ .ret = 0,
+ };
+
preempt_disable();
event_cpu = __perf_event_read_cpu(event, event_cpu);
@@ -3770,24 +3674,30 @@ static int perf_event_read(struct perf_event *event, bool group)
(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
preempt_enable();
ret = data.ret;
- } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+
+ } else if (state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ state = event->state;
+ if (state != PERF_EVENT_STATE_INACTIVE) {
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ goto again;
+ }
+
/*
- * may read while context is not active
- * (e.g., thread is blocked), in that case
- * we cannot update context time
+ * May read while context is not active (e.g., thread is
+ * blocked), in that case we cannot update context time
*/
- if (ctx->is_active) {
+ if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
+
+ perf_event_update_time(event);
if (group)
- update_group_times(event);
- else
- update_event_times(event);
+ perf_event_update_sibling_time(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -4233,7 +4143,7 @@ static void perf_remove_from_owner(struct perf_event *event)
* indeed free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
- owner = lockless_dereference(event->owner);
+ owner = READ_ONCE(event->owner);
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
@@ -4330,7 +4240,7 @@ again:
* Cannot change, child events are not migrated, see the
* comment with perf_event_ctx_lock_nested().
*/
- ctx = lockless_dereference(child->ctx);
+ ctx = READ_ONCE(child->ctx);
/*
* Since child_mutex nests inside ctx::mutex, we must jump
* through hoops. We start by grabbing a reference on the ctx.
@@ -4390,7 +4300,7 @@ static int perf_release(struct inode *inode, struct file *file)
return 0;
}
-u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
u64 total = 0;
@@ -4418,6 +4328,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
return total;
}
+
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+{
+ struct perf_event_context *ctx;
+ u64 count;
+
+ ctx = perf_event_ctx_lock(event);
+ count = __perf_event_read_value(event, enabled, running);
+ perf_event_ctx_unlock(event, ctx);
+
+ return count;
+}
EXPORT_SYMBOL_GPL(perf_event_read_value);
static int __perf_read_group_add(struct perf_event *leader,
@@ -4433,6 +4355,8 @@ static int __perf_read_group_add(struct perf_event *leader,
if (ret)
return ret;
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+
/*
* Since we co-schedule groups, {enabled,running} times of siblings
* will be identical to those of the leader, so we only publish one
@@ -4455,8 +4379,6 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
- raw_spin_lock_irqsave(&ctx->lock, flags);
-
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
@@ -4520,7 +4442,7 @@ static int perf_read_one(struct perf_event *event,
u64 values[4];
int n = 0;
- values[n++] = perf_event_read_value(event, &enabled, &running);
+ values[n++] = __perf_event_read_value(event, &enabled, &running);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
values[n++] = enabled;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
@@ -4899,8 +4821,7 @@ static void calc_timer_values(struct perf_event *event,
*now = perf_clock();
ctx_time = event->shadow_ctx_time + *now;
- *enabled = ctx_time - event->tstamp_enabled;
- *running = ctx_time - event->tstamp_running;
+ __perf_update_times(event, ctx_time, enabled, running);
}
static void perf_event_init_userpage(struct perf_event *event)
@@ -5304,8 +5225,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!rb)
goto aux_unlock;
- aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
- aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+ aux_offset = READ_ONCE(rb->user_page->aux_offset);
+ aux_size = READ_ONCE(rb->user_page->aux_size);
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
goto aux_unlock;
@@ -8074,6 +7995,7 @@ static void bpf_overflow_handler(struct perf_event *event,
struct bpf_perf_event_data_kern ctx = {
.data = data,
.regs = regs,
+ .event = event,
};
int ret = 0;
@@ -9404,6 +9326,11 @@ static void account_event(struct perf_event *event)
inc = true;
if (inc) {
+ /*
+ * We need the mutex here because static_branch_enable()
+ * must complete *before* the perf_sched_count increment
+ * becomes visible.
+ */
if (atomic_inc_not_zero(&perf_sched_count))
goto enabled;
@@ -10529,7 +10456,7 @@ perf_event_exit_event(struct perf_event *child_event,
if (parent_event)
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
- child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+ perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
raw_spin_unlock_irq(&child_ctx->lock);
/*
@@ -10767,7 +10694,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event *group_leader,
struct perf_event_context *child_ctx)
{
- enum perf_event_active_state parent_state = parent_event->state;
+ enum perf_event_state parent_state = parent_event->state;
struct perf_event *child_event;
unsigned long flags;
@@ -11103,6 +11030,7 @@ static void __perf_event_exit_context(void *__info)
struct perf_event *event;
raw_spin_lock(&ctx->lock);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index f684d8e5fa2b..f3e37971c842 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -381,7 +381,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* (B) <-> (C) ordering is still observed by the pmu driver.
*/
if (!rb->aux_overwrite) {
- aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
+ aux_tail = READ_ONCE(rb->user_page->aux_tail);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..6b4298a41167 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1339,7 +1339,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
* can't confuse the checks below.
*/
- int exit_state = ACCESS_ONCE(p->exit_state);
+ int exit_state = READ_ONCE(p->exit_state);
int ret;
if (unlikely(exit_state == EXIT_DEAD))
diff --git a/kernel/extable.c b/kernel/extable.c
index 9aa1cc41ecf7..a17fdb63dc3e 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -31,6 +31,8 @@
* mutex protecting text section modification (dynamic code patching).
* some users need to sleep (allocating memory...) while they hold this lock.
*
+ * Note: Also protects SMP-alternatives modification on x86.
+ *
* NOT exported to modules - patching kernel text is a really delicate matter.
*/
DEFINE_MUTEX(text_mutex);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index a117adf7084b..89e355866450 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -97,6 +97,12 @@ config HANDLE_DOMAIN_IRQ
config IRQ_TIMINGS
bool
+config GENERIC_IRQ_MATRIX_ALLOCATOR
+ bool
+
+config GENERIC_IRQ_RESERVATION_MODE
+ bool
+
config IRQ_DOMAIN_DEBUG
bool "Expose hardware/virtual IRQ mapping via debugfs"
depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index ed15d142694b..ff6e352e3a6c 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -14,3 +14,4 @@ obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
+obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index befa671fba64..4e8089b319ae 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -54,7 +54,7 @@ unsigned long probe_irq_on(void)
if (desc->irq_data.chip->irq_set_type)
desc->irq_data.chip->irq_set_type(&desc->irq_data,
IRQ_TYPE_PROBE);
- irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE);
+ irq_activate_and_startup(desc, IRQ_NORESEND);
}
raw_spin_unlock_irq(&desc->lock);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 5a2ef92c2782..043bfc35b353 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -207,20 +207,24 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
* Catch code which fiddles with enable_irq() on a managed
* and potentially shutdown IRQ. Chained interrupt
* installment or irq auto probing should not happen on
- * managed irqs either. Emit a warning, break the affinity
- * and start it up as a normal interrupt.
+ * managed irqs either.
*/
if (WARN_ON_ONCE(force))
- return IRQ_STARTUP_NORMAL;
+ return IRQ_STARTUP_ABORT;
/*
* The interrupt was requested, but there is no online CPU
* in it's affinity mask. Put it into managed shutdown
* state and let the cpu hotplug mechanism start it up once
* a CPU in the mask becomes available.
*/
- irqd_set_managed_shutdown(d);
return IRQ_STARTUP_ABORT;
}
+ /*
+ * Managed interrupts have reserved resources, so this should not
+ * happen.
+ */
+ if (WARN_ON(irq_domain_activate_irq(d, false)))
+ return IRQ_STARTUP_ABORT;
return IRQ_STARTUP_MANAGED;
}
#else
@@ -236,7 +240,9 @@ static int __irq_startup(struct irq_desc *desc)
struct irq_data *d = irq_desc_get_irq_data(desc);
int ret = 0;
- irq_domain_activate_irq(d);
+ /* Warn if this interrupt is not activated but try nevertheless */
+ WARN_ON_ONCE(!irqd_is_activated(d));
+
if (d->chip->irq_startup) {
ret = d->chip->irq_startup(d);
irq_state_clr_disabled(desc);
@@ -269,6 +275,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
ret = __irq_startup(desc);
break;
case IRQ_STARTUP_ABORT:
+ irqd_set_managed_shutdown(d);
return 0;
}
}
@@ -278,6 +285,22 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
return ret;
}
+int irq_activate(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+ if (!irqd_affinity_is_managed(d))
+ return irq_domain_activate_irq(d, false);
+ return 0;
+}
+
+void irq_activate_and_startup(struct irq_desc *desc, bool resend)
+{
+ if (WARN_ON(irq_activate(desc)))
+ return;
+ irq_startup(desc, resend, IRQ_START_FORCE);
+}
+
static void __irq_disable(struct irq_desc *desc, bool mask);
void irq_shutdown(struct irq_desc *desc)
@@ -953,7 +976,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
desc->action = &chained_action;
- irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
+ irq_activate_and_startup(desc, IRQ_RESEND);
}
}
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index c3fdb36dec30..7f608ac39653 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -81,6 +81,8 @@ irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind)
data->domain ? data->domain->name : "");
seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq);
irq_debug_show_chip(m, data, ind + 1);
+ if (data->domain && data->domain->ops && data->domain->ops->debug_show)
+ data->domain->ops->debug_show(m, NULL, data, ind + 1);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
if (!data->parent_data)
return;
@@ -149,6 +151,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
raw_spin_lock_irq(&desc->lock);
data = irq_desc_get_irq_data(desc);
seq_printf(m, "handler: %pf\n", desc->handle_irq);
+ seq_printf(m, "device: %s\n", desc->dev_name);
seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors);
irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
ARRAY_SIZE(irqdesc_states));
@@ -226,6 +229,15 @@ static const struct file_operations dfs_irq_ops = {
.release = single_release,
};
+void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ const char *name = dev_name(dev);
+
+ if (name)
+ desc->dev_name = kstrdup(name, GFP_KERNEL);
+}
+
void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
{
char name [10];
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 44ed5f8c8759..07d08ca701ec 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -75,6 +75,8 @@ extern void __enable_irq(struct irq_desc *desc);
#define IRQ_START_FORCE true
#define IRQ_START_COND false
+extern int irq_activate(struct irq_desc *desc);
+extern void irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
extern void irq_shutdown(struct irq_desc *desc);
@@ -437,6 +439,18 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
}
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
+#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool early)
+{
+ irqd_set_activated(data);
+ return 0;
+}
+static inline void irq_domain_deactivate_irq(struct irq_data *data)
+{
+ irqd_clr_activated(data);
+}
+#endif
+
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
#include <linux/debugfs.h>
@@ -444,7 +458,9 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
{
debugfs_remove(desc->debugfs_file);
+ kfree(desc->dev_name);
}
+void irq_debugfs_copy_devname(int irq, struct device *dev);
# ifdef CONFIG_IRQ_DOMAIN
void irq_domain_debugfs_init(struct dentry *root);
# else
@@ -459,4 +475,7 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
static inline void irq_remove_debugfs_entry(struct irq_desc *d)
{
}
+static inline void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+}
#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 82afb7ed369f..f2edcf85780d 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -27,7 +27,7 @@ static struct lock_class_key irq_desc_lock_class;
#if defined(CONFIG_SMP)
static int __init irq_affinity_setup(char *str)
{
- zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ alloc_bootmem_cpumask_var(&irq_default_affinity);
cpulist_parse(str, irq_default_affinity);
/*
* Set at least the boot cpu. We don't want to end up with
@@ -40,10 +40,8 @@ __setup("irqaffinity=", irq_affinity_setup);
static void __init init_irq_default_affinity(void)
{
-#ifdef CONFIG_CPUMASK_OFFSTACK
- if (!irq_default_affinity)
+ if (!cpumask_available(irq_default_affinity))
zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
-#endif
if (cpumask_empty(irq_default_affinity))
cpumask_setall(irq_default_affinity);
}
@@ -448,7 +446,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
}
}
- flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
+ flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0;
mask = NULL;
for (i = 0; i < cnt; i++) {
@@ -462,6 +460,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
goto err;
irq_insert_desc(start + i, desc);
irq_sysfs_add(start + i, desc);
+ irq_add_debugfs_entry(start + i, desc);
}
bitmap_set(allocated_irqs, start, cnt);
return start;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index ac4644e92b49..4f4f60015e8a 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -21,7 +21,6 @@
static LIST_HEAD(irq_domain_list);
static DEFINE_MUTEX(irq_domain_mutex);
-static DEFINE_MUTEX(revmap_trees_mutex);
static struct irq_domain *irq_default_domain;
static void irq_domain_check_hierarchy(struct irq_domain *domain);
@@ -211,6 +210,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
+ mutex_init(&domain->revmap_tree_mutex);
domain->ops = ops;
domain->host_data = host_data;
domain->hwirq_max = hwirq_max;
@@ -462,9 +462,9 @@ static void irq_domain_clear_mapping(struct irq_domain *domain,
if (hwirq < domain->revmap_size) {
domain->linear_revmap[hwirq] = 0;
} else {
- mutex_lock(&revmap_trees_mutex);
+ mutex_lock(&domain->revmap_tree_mutex);
radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
+ mutex_unlock(&domain->revmap_tree_mutex);
}
}
@@ -475,9 +475,9 @@ static void irq_domain_set_mapping(struct irq_domain *domain,
if (hwirq < domain->revmap_size) {
domain->linear_revmap[hwirq] = irq_data->irq;
} else {
- mutex_lock(&revmap_trees_mutex);
+ mutex_lock(&domain->revmap_tree_mutex);
radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&revmap_trees_mutex);
+ mutex_unlock(&domain->revmap_tree_mutex);
}
}
@@ -921,8 +921,7 @@ static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
chip = irq_data_get_irq_chip(data);
seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
- seq_printf(m, data ? "0x%p " : " %p ",
- irq_data_get_irq_chip_data(data));
+ seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data));
seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
@@ -1459,11 +1458,11 @@ static void irq_domain_fix_revmap(struct irq_data *d)
return; /* Not using radix tree. */
/* Fix up the revmap. */
- mutex_lock(&revmap_trees_mutex);
+ mutex_lock(&d->domain->revmap_tree_mutex);
slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
if (slot)
radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
- mutex_unlock(&revmap_trees_mutex);
+ mutex_unlock(&d->domain->revmap_tree_mutex);
}
/**
@@ -1682,28 +1681,36 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
-static void __irq_domain_activate_irq(struct irq_data *irq_data)
+static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
{
if (irq_data && irq_data->domain) {
struct irq_domain *domain = irq_data->domain;
+ if (domain->ops->deactivate)
+ domain->ops->deactivate(domain, irq_data);
if (irq_data->parent_data)
- __irq_domain_activate_irq(irq_data->parent_data);
- if (domain->ops->activate)
- domain->ops->activate(domain, irq_data);
+ __irq_domain_deactivate_irq(irq_data->parent_data);
}
}
-static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
+ int ret = 0;
- if (domain->ops->deactivate)
- domain->ops->deactivate(domain, irq_data);
- if (irq_data->parent_data)
- __irq_domain_deactivate_irq(irq_data->parent_data);
+ if (irqd && irqd->domain) {
+ struct irq_domain *domain = irqd->domain;
+
+ if (irqd->parent_data)
+ ret = __irq_domain_activate_irq(irqd->parent_data,
+ early);
+ if (!ret && domain->ops->activate) {
+ ret = domain->ops->activate(domain, irqd, early);
+ /* Rollback in case of error */
+ if (ret && irqd->parent_data)
+ __irq_domain_deactivate_irq(irqd->parent_data);
+ }
}
+ return ret;
}
/**
@@ -1714,12 +1721,15 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
* This is the second step to call domain_ops->activate to program interrupt
* controllers, so the interrupt could actually get delivered.
*/
-void irq_domain_activate_irq(struct irq_data *irq_data)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool early)
{
- if (!irqd_is_activated(irq_data)) {
- __irq_domain_activate_irq(irq_data);
+ int ret = 0;
+
+ if (!irqd_is_activated(irq_data))
+ ret = __irq_domain_activate_irq(irq_data, early);
+ if (!ret)
irqd_set_activated(irq_data);
- }
+ return ret;
}
/**
@@ -1810,6 +1820,8 @@ irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
d->revmap_size + d->revmap_direct_max_irq);
seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags);
+ if (d->ops && d->ops->debug_show)
+ d->ops->debug_show(m, d, NULL, ind + 1);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
if (!d->parent)
return;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4bff6a10ae8e..2ff1c0c82fc9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -398,7 +398,8 @@ int irq_select_affinity_usr(unsigned int irq)
/**
* irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
* @irq: interrupt number to set affinity
- * @vcpu_info: vCPU specific data
+ * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
+ * specific data for percpu_devid interrupts
*
* This function uses the vCPU specific data to set the vCPU
* affinity for an irq. The vCPU specific data is passed from
@@ -536,7 +537,7 @@ void __enable_irq(struct irq_desc *desc)
* time. If it was already started up, then irq_startup()
* will invoke irq_enable() under the hood.
*/
- irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
break;
}
default:
@@ -1305,7 +1306,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* thread_mask assigned. See the loop above which or's
* all existing action->thread_mask bits.
*/
- new->thread_mask = 1 << ffz(thread_mask);
+ new->thread_mask = 1UL << ffz(thread_mask);
} else if (new->handler == irq_default_primary_handler &&
!(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
@@ -1342,6 +1343,21 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
goto out_unlock;
}
+ /*
+ * Activate the interrupt. That activation must happen
+ * independently of IRQ_NOAUTOEN. request_irq() can fail
+ * and the callers are supposed to handle
+ * that. enable_irq() of an interrupt requested with
+ * IRQ_NOAUTOEN is not supposed to fail. The activation
+ * keeps it in shutdown mode, it merily associates
+ * resources if necessary and if that's not possible it
+ * fails. Interrupts which are in managed shutdown mode
+ * will simply ignore that activation request.
+ */
+ ret = irq_activate(desc);
+ if (ret)
+ goto out_unlock;
+
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
IRQS_ONESHOT | IRQS_WAITING);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -1417,7 +1433,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
wake_up_process(new->secondary->thread);
register_irq_proc(irq, desc);
- irq_add_debugfs_entry(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
return 0;
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
new file mode 100644
index 000000000000..a3cbbc8191c5
--- /dev/null
+++ b/kernel/irq/matrix.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+#include <linux/bitmap.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+#define IRQ_MATRIX_SIZE (BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long))
+
+struct cpumap {
+ unsigned int available;
+ unsigned int allocated;
+ unsigned int managed;
+ bool online;
+ unsigned long alloc_map[IRQ_MATRIX_SIZE];
+ unsigned long managed_map[IRQ_MATRIX_SIZE];
+};
+
+struct irq_matrix {
+ unsigned int matrix_bits;
+ unsigned int alloc_start;
+ unsigned int alloc_end;
+ unsigned int alloc_size;
+ unsigned int global_available;
+ unsigned int global_reserved;
+ unsigned int systembits_inalloc;
+ unsigned int total_allocated;
+ unsigned int online_maps;
+ struct cpumap __percpu *maps;
+ unsigned long scratch_map[IRQ_MATRIX_SIZE];
+ unsigned long system_map[IRQ_MATRIX_SIZE];
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq_matrix.h>
+
+/**
+ * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it
+ * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS
+ * @alloc_start: From which bit the allocation search starts
+ * @alloc_end: At which bit the allocation search ends, i.e first
+ * invalid bit
+ */
+__init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits,
+ unsigned int alloc_start,
+ unsigned int alloc_end)
+{
+ struct irq_matrix *m;
+
+ if (matrix_bits > IRQ_MATRIX_BITS)
+ return NULL;
+
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (!m)
+ return NULL;
+
+ m->matrix_bits = matrix_bits;
+ m->alloc_start = alloc_start;
+ m->alloc_end = alloc_end;
+ m->alloc_size = alloc_end - alloc_start;
+ m->maps = alloc_percpu(*m->maps);
+ if (!m->maps) {
+ kfree(m);
+ return NULL;
+ }
+ return m;
+}
+
+/**
+ * irq_matrix_online - Bring the local CPU matrix online
+ * @m: Matrix pointer
+ */
+void irq_matrix_online(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ BUG_ON(cm->online);
+
+ bitmap_zero(cm->alloc_map, m->matrix_bits);
+ cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc);
+ cm->allocated = 0;
+ m->global_available += cm->available;
+ cm->online = true;
+ m->online_maps++;
+ trace_irq_matrix_online(m);
+}
+
+/**
+ * irq_matrix_offline - Bring the local CPU matrix offline
+ * @m: Matrix pointer
+ */
+void irq_matrix_offline(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ /* Update the global available size */
+ m->global_available -= cm->available;
+ cm->online = false;
+ m->online_maps--;
+ trace_irq_matrix_offline(m);
+}
+
+static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm,
+ unsigned int num, bool managed)
+{
+ unsigned int area, start = m->alloc_start;
+ unsigned int end = m->alloc_end;
+
+ bitmap_or(m->scratch_map, cm->managed_map, m->system_map, end);
+ bitmap_or(m->scratch_map, m->scratch_map, cm->alloc_map, end);
+ area = bitmap_find_next_zero_area(m->scratch_map, end, start, num, 0);
+ if (area >= end)
+ return area;
+ if (managed)
+ bitmap_set(cm->managed_map, area, num);
+ else
+ bitmap_set(cm->alloc_map, area, num);
+ return area;
+}
+
+/**
+ * irq_matrix_assign_system - Assign system wide entry in the matrix
+ * @m: Matrix pointer
+ * @bit: Which bit to reserve
+ * @replace: Replace an already allocated vector with a system
+ * vector at the same bit position.
+ *
+ * The BUG_ON()s below are on purpose. If this goes wrong in the
+ * early boot process, then the chance to survive is about zero.
+ * If this happens when the system is life, it's not much better.
+ */
+void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit,
+ bool replace)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ BUG_ON(bit > m->matrix_bits);
+ BUG_ON(m->online_maps > 1 || (m->online_maps && !replace));
+
+ set_bit(bit, m->system_map);
+ if (replace) {
+ BUG_ON(!test_and_clear_bit(bit, cm->alloc_map));
+ cm->allocated--;
+ m->total_allocated--;
+ }
+ if (bit >= m->alloc_start && bit < m->alloc_end)
+ m->systembits_inalloc++;
+
+ trace_irq_matrix_assign_system(bit, m);
+}
+
+/**
+ * irq_matrix_reserve_managed - Reserve a managed interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: On which CPUs the bits should be reserved.
+ *
+ * Can be called for offline CPUs. Note, this will only reserve one bit
+ * on all CPUs in @msk, but it's not guaranteed that the bits are at the
+ * same offset on all CPUs
+ */
+int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+ unsigned int cpu, failed_cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit;
+
+ bit = matrix_alloc_area(m, cm, 1, true);
+ if (bit >= m->alloc_end)
+ goto cleanup;
+ cm->managed++;
+ if (cm->online) {
+ cm->available--;
+ m->global_available--;
+ }
+ trace_irq_matrix_reserve_managed(bit, cpu, m, cm);
+ }
+ return 0;
+cleanup:
+ failed_cpu = cpu;
+ for_each_cpu(cpu, msk) {
+ if (cpu == failed_cpu)
+ break;
+ irq_matrix_remove_managed(m, cpumask_of(cpu));
+ }
+ return -ENOSPC;
+}
+
+/**
+ * irq_matrix_remove_managed - Remove managed interrupts in a CPU map
+ * @m: Matrix pointer
+ * @msk: On which CPUs the bits should be removed
+ *
+ * Can be called for offline CPUs
+ *
+ * This removes not allocated managed interrupts from the map. It does
+ * not matter which one because the managed interrupts free their
+ * allocation when they shut down. If not, the accounting is screwed,
+ * but all what can be done at this point is warn about it.
+ */
+void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit, end = m->alloc_end;
+
+ if (WARN_ON_ONCE(!cm->managed))
+ continue;
+
+ /* Get managed bit which are not allocated */
+ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+
+ bit = find_first_bit(m->scratch_map, end);
+ if (WARN_ON_ONCE(bit >= end))
+ continue;
+
+ clear_bit(bit, cm->managed_map);
+
+ cm->managed--;
+ if (cm->online) {
+ cm->available++;
+ m->global_available++;
+ }
+ trace_irq_matrix_remove_managed(bit, cpu, m, cm);
+ }
+}
+
+/**
+ * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map
+ * @m: Matrix pointer
+ * @cpu: On which CPU the interrupt should be allocated
+ */
+int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu)
+{
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit, end = m->alloc_end;
+
+ /* Get managed bit which are not allocated */
+ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+ bit = find_first_bit(m->scratch_map, end);
+ if (bit >= end)
+ return -ENOSPC;
+ set_bit(bit, cm->alloc_map);
+ cm->allocated++;
+ m->total_allocated++;
+ trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
+ return bit;
+}
+
+/**
+ * irq_matrix_assign - Assign a preallocated interrupt in the local CPU map
+ * @m: Matrix pointer
+ * @bit: Which bit to mark
+ *
+ * This should only be used to mark preallocated vectors
+ */
+void irq_matrix_assign(struct irq_matrix *m, unsigned int bit)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+ return;
+ if (WARN_ON_ONCE(test_and_set_bit(bit, cm->alloc_map)))
+ return;
+ cm->allocated++;
+ m->total_allocated++;
+ cm->available--;
+ m->global_available--;
+ trace_irq_matrix_assign(bit, smp_processor_id(), m, cm);
+}
+
+/**
+ * irq_matrix_reserve - Reserve interrupts
+ * @m: Matrix pointer
+ *
+ * This is merily a book keeping call. It increments the number of globally
+ * reserved interrupt bits w/o actually allocating them. This allows to
+ * setup interrupt descriptors w/o assigning low level resources to it.
+ * The actual allocation happens when the interrupt gets activated.
+ */
+void irq_matrix_reserve(struct irq_matrix *m)
+{
+ if (m->global_reserved <= m->global_available &&
+ m->global_reserved + 1 > m->global_available)
+ pr_warn("Interrupt reservation exceeds available resources\n");
+
+ m->global_reserved++;
+ trace_irq_matrix_reserve(m);
+}
+
+/**
+ * irq_matrix_remove_reserved - Remove interrupt reservation
+ * @m: Matrix pointer
+ *
+ * This is merily a book keeping call. It decrements the number of globally
+ * reserved interrupt bits. This is used to undo irq_matrix_reserve() when the
+ * interrupt was never in use and a real vector allocated, which undid the
+ * reservation.
+ */
+void irq_matrix_remove_reserved(struct irq_matrix *m)
+{
+ m->global_reserved--;
+ trace_irq_matrix_remove_reserved(m);
+}
+
+/**
+ * irq_matrix_alloc - Allocate a regular interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: Which CPUs to search in
+ * @reserved: Allocate previously reserved interrupts
+ * @mapped_cpu: Pointer to store the CPU for which the irq was allocated
+ */
+int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
+ bool reserved, unsigned int *mapped_cpu)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit;
+
+ if (!cm->online)
+ continue;
+
+ bit = matrix_alloc_area(m, cm, 1, false);
+ if (bit < m->alloc_end) {
+ cm->allocated++;
+ cm->available--;
+ m->total_allocated++;
+ m->global_available--;
+ if (reserved)
+ m->global_reserved--;
+ *mapped_cpu = cpu;
+ trace_irq_matrix_alloc(bit, cpu, m, cm);
+ return bit;
+ }
+ }
+ return -ENOSPC;
+}
+
+/**
+ * irq_matrix_free - Free allocated interrupt in the matrix
+ * @m: Matrix pointer
+ * @cpu: Which CPU map needs be updated
+ * @bit: The bit to remove
+ * @managed: If true, the interrupt is managed and not accounted
+ * as available.
+ */
+void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
+ unsigned int bit, bool managed)
+{
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+ return;
+
+ if (cm->online) {
+ clear_bit(bit, cm->alloc_map);
+ cm->allocated--;
+ m->total_allocated--;
+ if (!managed) {
+ cm->available++;
+ m->global_available++;
+ }
+ }
+ trace_irq_matrix_free(bit, cpu, m, cm);
+}
+
+/**
+ * irq_matrix_available - Get the number of globally available irqs
+ * @m: Pointer to the matrix to query
+ * @cpudown: If true, the local CPU is about to go down, adjust
+ * the number of available irqs accordingly
+ */
+unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ return m->global_available - cpudown ? cm->available : 0;
+}
+
+/**
+ * irq_matrix_reserved - Get the number of globally reserved irqs
+ * @m: Pointer to the matrix to query
+ */
+unsigned int irq_matrix_reserved(struct irq_matrix *m)
+{
+ return m->global_reserved;
+}
+
+/**
+ * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * @m: Pointer to the matrix to search
+ *
+ * This returns number of allocated irqs
+ */
+unsigned int irq_matrix_allocated(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ return cm->allocated;
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+/**
+ * irq_matrix_debug_show - Show detailed allocation information
+ * @sf: Pointer to the seq_file to print to
+ * @m: Pointer to the matrix allocator
+ * @ind: Indentation for the print format
+ *
+ * Note, this is a lockless snapshot.
+ */
+void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
+{
+ unsigned int nsys = bitmap_weight(m->system_map, m->matrix_bits);
+ int cpu;
+
+ seq_printf(sf, "Online bitmaps: %6u\n", m->online_maps);
+ seq_printf(sf, "Global available: %6u\n", m->global_available);
+ seq_printf(sf, "Global reserved: %6u\n", m->global_reserved);
+ seq_printf(sf, "Total allocated: %6u\n", m->total_allocated);
+ seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
+ m->system_map);
+ seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ seq_printf(sf, "%*s %4d %4u %4u %4u %*pbl\n", ind, " ",
+ cpu, cm->available, cm->managed, cm->allocated,
+ m->matrix_bits, cm->alloc_map);
+ }
+ cpus_read_unlock();
+}
+#endif
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3fa4bd59f569..edb987b2c58d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -16,6 +16,8 @@
#include <linux/msi.h>
#include <linux/slab.h>
+#include "internals.h"
+
/**
* alloc_msi_entry - Allocate an initialize msi_entry
* @dev: Pointer to the device for which this is allocated
@@ -100,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data,
return ret;
}
-static void msi_domain_activate(struct irq_domain *domain,
- struct irq_data *irq_data)
+static int msi_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool early)
{
struct msi_msg msg;
BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
irq_chip_write_msi_msg(irq_data, &msg);
+ return 0;
}
static void msi_domain_deactivate(struct irq_domain *domain,
@@ -373,8 +376,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
return ret;
}
- for (i = 0; i < desc->nvec_used; i++)
+ for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
+ irq_debugfs_copy_devname(virq + i, dev);
+ }
}
if (ops->msi_finish)
@@ -396,11 +401,28 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct irq_data *irq_data;
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- irq_domain_activate_irq(irq_data);
+ ret = irq_domain_activate_irq(irq_data, true);
+ if (ret)
+ goto cleanup;
+ if (info->flags & MSI_FLAG_MUST_REACTIVATE)
+ irqd_clr_activated(irq_data);
}
}
-
return 0;
+
+cleanup:
+ for_each_msi_entry(desc, dev) {
+ struct irq_data *irqd;
+
+ if (desc->irq == virq)
+ break;
+
+ irqd = irq_domain_get_irq_data(domain, desc->irq);
+ if (irqd_is_activated(irqd))
+ irq_domain_deactivate_irq(irqd);
+ }
+ msi_domain_free_irqs(domain, dev);
+ return ret;
}
/**
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index c010cc0daf79..e8f374971e37 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -155,8 +155,9 @@ static ssize_t write_irq_affinity(int type, struct file *file,
*/
err = irq_select_affinity_usr(irq) ? -EINVAL : count;
} else {
- irq_set_affinity(irq, new_value);
- err = count;
+ err = irq_set_affinity(irq, new_value);
+ if (!err)
+ err = count;
}
free_cpumask:
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 987d7bca4864..1215229d1c12 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,7 +21,7 @@ static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
static void poll_spurious_irqs(unsigned long dummy);
-static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
static int irq_poll_cpu;
static atomic_t irq_poll_active;
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index c8c1d073fbf1..e0923fa4927a 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -264,7 +264,7 @@ u64 irq_timings_next_event(u64 now)
* order to prevent the timings circular buffer to be updated
* while we are reading it.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
/*
* Number of elements in the circular buffer: If it happens it
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index bcf107ce0854..40e9d739c169 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -56,7 +56,6 @@ void __weak arch_irq_work_raise(void)
*/
}
-#ifdef CONFIG_SMP
/*
* Enqueue the irq_work @work on @cpu unless it's already pending
* somewhere.
@@ -68,6 +67,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(cpu));
+#ifdef CONFIG_SMP
+
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
@@ -78,10 +79,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
arch_send_call_function_single_ipi(cpu);
+#else /* #ifdef CONFIG_SMP */
+ irq_work_queue(work);
+#endif /* #else #ifdef CONFIG_SMP */
+
return true;
}
-EXPORT_SYMBOL_GPL(irq_work_queue_on);
-#endif
/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
@@ -128,9 +131,9 @@ bool irq_work_needs_cpu(void)
static void irq_work_run_list(struct llist_head *list)
{
- unsigned long flags;
- struct irq_work *work;
+ struct irq_work *work, *tmp;
struct llist_node *llnode;
+ unsigned long flags;
BUG_ON(!irqs_disabled());
@@ -138,11 +141,7 @@ static void irq_work_run_list(struct llist_head *list)
return;
llnode = llist_del_all(list);
- while (llnode != NULL) {
- work = llist_entry(llnode, struct irq_work, llnode);
-
- llnode = llist_next(llnode);
-
+ llist_for_each_entry_safe(work, tmp, llnode, llnode) {
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
@@ -188,7 +187,7 @@ void irq_work_tick(void)
*/
void irq_work_sync(struct irq_work *work)
{
- WARN_ON_ONCE(irqs_disabled());
+ lockdep_assert_irqs_enabled();
while (work->flags & IRQ_WORK_BUSY)
cpu_relax();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0bf2e8f5244a..8ff4ca4665ff 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -83,7 +83,7 @@ static void static_key_slow_inc_cpuslocked(struct static_key *key)
{
int v, v1;
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
/*
* Careful if we get concurrent static_key_slow_inc() calls;
@@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
void static_key_enable_cpuslocked(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
if (atomic_read(&key->enabled) > 0) {
WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(static_key_enable);
void static_key_disable_cpuslocked(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
if (atomic_read(&key->enabled) != 1) {
WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
@@ -224,21 +224,21 @@ static void jump_label_update_timeout(struct work_struct *work)
void static_key_slow_dec(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
__static_key_slow_dec(key, 0, NULL);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
__static_key_slow_dec(&key->key, key->timeout, &key->work);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
void static_key_deferred_flush(struct static_key_deferred *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
flush_delayed_work(&key->work);
}
EXPORT_SYMBOL_GPL(static_key_deferred_flush);
@@ -246,7 +246,7 @@ EXPORT_SYMBOL_GPL(static_key_deferred_flush);
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 127e7cfafa55..1e6ae66c6244 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -480,6 +480,7 @@ struct kallsym_iter {
char name[KSYM_NAME_LEN];
char module_name[MODULE_NAME_LEN];
int exported;
+ int show_value;
};
static int get_ksymbol_mod(struct kallsym_iter *iter)
@@ -582,12 +583,15 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
+ unsigned long value;
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
+ value = iter->show_value ? iter->value : 0;
+
if (iter->module_name[0]) {
char type;
@@ -597,10 +601,10 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+ seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value,
type, iter->name, iter->module_name);
} else
- seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+ seq_printf(m, KALLSYM_FMT " %c %s\n", value,
iter->type, iter->name);
return 0;
}
@@ -612,6 +616,40 @@ static const struct seq_operations kallsyms_op = {
.show = s_show
};
+static inline int kallsyms_for_perf(void)
+{
+#ifdef CONFIG_PERF_EVENTS
+ extern int sysctl_perf_event_paranoid;
+ if (sysctl_perf_event_paranoid <= 1)
+ return 1;
+#endif
+ return 0;
+}
+
+/*
+ * We show kallsyms information even to normal users if we've enabled
+ * kernel profiling and are explicitly not paranoid (so kptr_restrict
+ * is clear, and sysctl_perf_event_paranoid isn't set).
+ *
+ * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
+ * block even that).
+ */
+int kallsyms_show_value(void)
+{
+ switch (kptr_restrict) {
+ case 0:
+ if (kallsyms_for_perf())
+ return 1;
+ /* fallthrough */
+ case 1:
+ if (has_capability_noaudit(current, CAP_SYSLOG))
+ return 1;
+ /* fallthrough */
+ default:
+ return 0;
+ }
+}
+
static int kallsyms_open(struct inode *inode, struct file *file)
{
/*
@@ -625,6 +663,7 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return -ENOMEM;
reset_iter(iter, 0);
+ iter->show_value = kallsyms_show_value();
return 0;
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 9f48f4412297..e5bcd94c1efb 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -406,9 +406,10 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
return 1;
}
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+static int locate_mem_hole_callback(struct resource *res, void *arg)
{
struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ u64 start = res->start, end = res->end;
unsigned long sz = end - start + 1;
/* Returning 0 will take to next memory range */
@@ -437,7 +438,7 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
* func returning non-zero, then zero will be returned.
*/
int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
- int (*func)(u64, u64, void *))
+ int (*func)(struct resource *, void *))
{
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return walk_iomem_res_desc(crashk_res.desc,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a1606a4224e1..da2ccf142358 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
SLOT_USED = 2,
};
-static void *alloc_insn_page(void)
+void __weak *alloc_insn_page(void)
{
return module_alloc(PAGE_SIZE);
}
@@ -573,13 +573,15 @@ static void kprobe_optimizer(struct work_struct *work)
do_unoptimize_kprobes();
/*
- * Step 2: Wait for quiesence period to ensure all running interrupts
- * are done. Because optprobe may modify multiple instructions
- * there is a chance that Nth instruction is interrupted. In that
- * case, running interrupt can return to 2nd-Nth byte of jump
- * instruction. This wait is for avoiding it.
+ * Step 2: Wait for quiesence period to ensure all potentially
+ * preempted tasks to have normally scheduled. Because optprobe
+ * may modify multiple instructions, there is a chance that Nth
+ * instruction is preempted. In that case, such tasks can return
+ * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
+ * Note that on non-preemptive kernel, this is transparently converted
+ * to synchronoze_sched() to wait for all interrupts to have completed.
*/
- synchronize_sched();
+ synchronize_rcu_tasks();
/* Step 3: Optimize kprobes after quiesence period */
do_optimize_kprobes();
@@ -1769,6 +1771,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
return (unsigned long)entry;
}
+#if 0
int register_jprobes(struct jprobe **jps, int num)
{
int ret = 0, i;
@@ -1837,6 +1840,7 @@ void unregister_jprobes(struct jprobe **jps, int num)
}
}
EXPORT_SYMBOL_GPL(unregister_jprobes);
+#endif
#ifdef CONFIG_KRETPROBES
/*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1c19edf82427..ba3992c8c375 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -798,15 +798,14 @@ EXPORT_SYMBOL_GPL(kthread_queue_work);
/**
* kthread_delayed_work_timer_fn - callback that queues the associated kthread
* delayed work when the timer expires.
- * @__data: pointer to the data associated with the timer
+ * @t: pointer to the expired timer
*
* The format of the function is defined by struct timer_list.
* It should have been called from irqsafe timer with irq already off.
*/
-void kthread_delayed_work_timer_fn(unsigned long __data)
+void kthread_delayed_work_timer_fn(struct timer_list *t)
{
- struct kthread_delayed_work *dwork =
- (struct kthread_delayed_work *)__data;
+ struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
struct kthread_work *work = &dwork->work;
struct kthread_worker *worker = work->worker;
@@ -837,8 +836,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn ||
- timer->data != (unsigned long)dwork);
+ WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e36e652d996f..db933d063bfc 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -76,6 +76,19 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
+#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
+static int crossrelease_fullstack = 1;
+#else
+static int crossrelease_fullstack;
+#endif
+static int __init allow_crossrelease_fullstack(char *str)
+{
+ crossrelease_fullstack = 1;
+ return 0;
+}
+
+early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -4863,8 +4876,14 @@ static void add_xhlock(struct held_lock *hlock)
xhlock->trace.nr_entries = 0;
xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
xhlock->trace.entries = xhlock->trace_entries;
- xhlock->trace.skip = 3;
- save_stack_trace(&xhlock->trace);
+
+ if (crossrelease_fullstack) {
+ xhlock->trace.skip = 3;
+ save_stack_trace(&xhlock->trace);
+ } else {
+ xhlock->trace.nr_entries = 1;
+ xhlock->trace.entries[0] = hlock->acquire_ip;
+ }
}
static inline int same_context_xhlock(struct hist_lock *xhlock)
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 2655f26ec882..c7471c3fb798 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -23,49 +23,11 @@
#include <linux/spinlock.h>
#include <asm/qrwlock.h>
-/*
- * This internal data structure is used for optimizing access to some of
- * the subfields within the atomic_t cnts.
- */
-struct __qrwlock {
- union {
- atomic_t cnts;
- struct {
-#ifdef __LITTLE_ENDIAN
- u8 wmode; /* Writer mode */
- u8 rcnts[3]; /* Reader counts */
-#else
- u8 rcnts[3]; /* Reader counts */
- u8 wmode; /* Writer mode */
-#endif
- };
- };
- arch_spinlock_t lock;
-};
-
-/**
- * rspin_until_writer_unlock - inc reader count & spin until writer is gone
- * @lock : Pointer to queue rwlock structure
- * @writer: Current queue rwlock writer status byte
- *
- * In interrupt context or at the head of the queue, the reader will just
- * increment the reader count & wait until the writer releases the lock.
- */
-static __always_inline void
-rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
-{
- while ((cnts & _QW_WMASK) == _QW_LOCKED) {
- cpu_relax();
- cnts = atomic_read_acquire(&lock->cnts);
- }
-}
-
/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
- * @cnts: Current qrwlock lock value
*/
-void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
+void queued_read_lock_slowpath(struct qrwlock *lock)
{
/*
* Readers come here when they cannot get the lock without waiting
@@ -73,13 +35,11 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
if (unlikely(in_interrupt())) {
/*
* Readers in interrupt context will get the lock immediately
- * if the writer is just waiting (not holding the lock yet).
- * The rspin_until_writer_unlock() function returns immediately
- * in this case. Otherwise, they will spin (with ACQUIRE
- * semantics) until the lock is available without waiting in
- * the queue.
+ * if the writer is just waiting (not holding the lock yet),
+ * so spin with ACQUIRE semantics until the lock is available
+ * without waiting in the queue.
*/
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
return;
}
atomic_sub(_QR_BIAS, &lock->cnts);
@@ -88,14 +48,14 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
* Put the reader into the wait queue
*/
arch_spin_lock(&lock->wait_lock);
+ atomic_add(_QR_BIAS, &lock->cnts);
/*
* The ACQUIRE semantics of the following spinning code ensure
* that accesses can't leak upwards out of our subsequent critical
* section in the case that the lock is currently held for write.
*/
- cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
/*
* Signal the next one in queue to become queue head
@@ -110,8 +70,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
*/
void queued_write_lock_slowpath(struct qrwlock *lock)
{
- u32 cnts;
-
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
@@ -120,30 +78,14 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
(atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
goto unlock;
- /*
- * Set the waiting flag to notify readers that a writer is pending,
- * or wait for a previous writer to go away.
- */
- for (;;) {
- struct __qrwlock *l = (struct __qrwlock *)lock;
-
- if (!READ_ONCE(l->wmode) &&
- (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
- break;
+ /* Set the waiting flag to notify readers that a writer is pending */
+ atomic_add(_QW_WAITING, &lock->cnts);
- cpu_relax();
- }
-
- /* When no more readers, set the locked flag */
- for (;;) {
- cnts = atomic_read(&lock->cnts);
- if ((cnts == _QW_WAITING) &&
- (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) == _QW_WAITING))
- break;
-
- cpu_relax();
- }
+ /* When no more readers or writers, set the locked flag */
+ do {
+ atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
+ } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) != _QW_WAITING);
unlock:
arch_spin_unlock(&lock->wait_lock);
}
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 15b6a39366c6..6ee477765e6c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -61,21 +61,50 @@ struct pv_node {
#include "qspinlock_stat.h"
/*
+ * Hybrid PV queued/unfair lock
+ *
* By replacing the regular queued_spin_trylock() with the function below,
* it will be called once when a lock waiter enter the PV slowpath before
- * being queued. By allowing one lock stealing attempt here when the pending
- * bit is off, it helps to reduce the performance impact of lock waiter
- * preemption without the drawback of lock starvation.
+ * being queued.
+ *
+ * The pending bit is set by the queue head vCPU of the MCS wait queue in
+ * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
+ * When that bit becomes visible to the incoming waiters, no lock stealing
+ * is allowed. The function will return immediately to make the waiters
+ * enter the MCS wait queue. So lock starvation shouldn't happen as long
+ * as the queued mode vCPUs are actively running to set the pending bit
+ * and hence disabling lock stealing.
+ *
+ * When the pending bit isn't set, the lock waiters will stay in the unfair
+ * mode spinning on the lock unless the MCS wait queue is empty. In this
+ * case, the lock waiters will enter the queued mode slowpath trying to
+ * become the queue head and set the pending bit.
+ *
+ * This hybrid PV queued/unfair lock combines the best attributes of a
+ * queued lock (no lock starvation) and an unfair lock (good performance
+ * on not heavily contended locks).
*/
-#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
-static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l)
+static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
- if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
- qstat_inc(qstat_pv_lock_stealing, true);
- return true;
+ /*
+ * Stay in unfair lock mode as long as queued mode waiters are
+ * present in the MCS wait queue but the pending bit isn't set.
+ */
+ for (;;) {
+ int val = atomic_read(&lock->val);
+
+ if (!(val & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ qstat_inc(qstat_pv_lock_stealing, true);
+ return true;
+ }
+ if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
+ break;
+
+ cpu_relax();
}
return false;
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index a6c76a4832b4..f549c552dbf1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -29,6 +29,22 @@ void __sched down_read(struct rw_semaphore *sem)
EXPORT_SYMBOL(down_read);
+int __sched down_read_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_reader_owned(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_read_killable);
+
/*
* trylock for reading -- returns 1 if successful, 0 if contention
*/
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 6e40fdfba326..1fd1a7543cdd 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -30,11 +30,10 @@
#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
/*
* The __lock_function inlines are taken from
- * include/linux/spinlock_api_smp.h
+ * spinlock : include/linux/spinlock_api_smp.h
+ * rwlock : include/linux/rwlock_api_smp.h
*/
#else
-#define raw_read_can_lock(l) read_can_lock(l)
-#define raw_write_can_lock(l) write_can_lock(l)
/*
* Some architectures can relax in favour of the CPU owning the lock.
@@ -69,7 +68,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
\
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+ while ((lock)->break_lock) \
arch_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
@@ -89,7 +88,7 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
\
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+ while ((lock)->break_lock) \
arch_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
diff --git a/kernel/module.c b/kernel/module.c
index de66ec825992..32c2cdaccd93 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -278,6 +278,16 @@ static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
module_param(sig_enforce, bool_enable_only, 0644);
#endif /* !CONFIG_MODULE_SIG_FORCE */
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+ return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
/* Block module loading/unloading? */
int modules_disabled = 0;
core_param(nomodule, modules_disabled, bint, 0);
@@ -1516,7 +1526,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
sattr->mattr.show = module_sect_show;
sattr->mattr.store = NULL;
sattr->mattr.attr.name = sattr->name;
- sattr->mattr.attr.mode = S_IRUGO;
+ sattr->mattr.attr.mode = S_IRUSR;
*(gattr++) = &(sattr++)->mattr.attr;
}
*gattr = NULL;
@@ -4147,6 +4157,7 @@ static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
char buf[MODULE_FLAGS_BUF_SIZE];
+ unsigned long value;
/* We always ignore unformed modules. */
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4162,7 +4173,8 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%pK", mod->core_layout.base);
+ value = m->private ? 0 : (unsigned long)mod->core_layout.base;
+ seq_printf(m, " 0x" KALLSYM_FMT, value);
/* Taints info */
if (mod->taints)
@@ -4184,9 +4196,23 @@ static const struct seq_operations modules_op = {
.show = m_show
};
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
static int modules_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &modules_op);
+ int err = seq_open(file, &modules_op);
+
+ if (!err) {
+ struct seq_file *m = file->private_data;
+ m->private = kallsyms_show_value() ? NULL : (void *)8ul;
+ }
+
+ return 0;
}
static const struct file_operations proc_modules_operations = {
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index e4b43fef89f5..59c471de342a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -203,6 +203,21 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
extern int rcu_cpu_stall_suppress;
int rcu_jiffies_till_stall_check(void);
+#define rcu_ftrace_dump_stall_suppress() \
+do { \
+ if (!rcu_cpu_stall_suppress) \
+ rcu_cpu_stall_suppress = 3; \
+} while (0)
+
+#define rcu_ftrace_dump_stall_unsuppress() \
+do { \
+ if (rcu_cpu_stall_suppress == 3) \
+ rcu_cpu_stall_suppress = 0; \
+} while (0)
+
+#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */
+#define rcu_ftrace_dump_stall_suppress()
+#define rcu_ftrace_dump_stall_unsuppress()
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
/*
@@ -220,8 +235,12 @@ do { \
static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
\
if (!atomic_read(&___rfd_beenhere) && \
- !atomic_xchg(&___rfd_beenhere, 1)) \
+ !atomic_xchg(&___rfd_beenhere, 1)) { \
+ tracing_off(); \
+ rcu_ftrace_dump_stall_suppress(); \
ftrace_dump(oops_dump_mode); \
+ rcu_ftrace_dump_stall_unsuppress(); \
+ } \
} while (0)
void rcu_early_boot_tests(void);
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 7649fcd2c4c7..88cba7c2956c 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -23,6 +23,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/rcupdate.h>
#include "rcu_segcblist.h"
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 45f2ffbc1e78..74f6b0146b98 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -51,6 +51,7 @@
#include <asm/byteorder.h>
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include <linux/sched/debug.h>
#include "rcu.h"
@@ -89,6 +90,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
torture_param(int, stall_cpu_holdoff, 10,
"Time to wait before starting stall (s).");
+torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stat_interval, 60,
"Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
@@ -1076,7 +1078,7 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp)
* counter in the element should never be greater than 1, otherwise, the
* RCU implementation is broken.
*/
-static void rcu_torture_timer(unsigned long unused)
+static void rcu_torture_timer(struct timer_list *unused)
{
int idx;
unsigned long started;
@@ -1163,7 +1165,7 @@ rcu_torture_reader(void *arg)
VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
- setup_timer_on_stack(&t, rcu_torture_timer, 0);
+ timer_setup_on_stack(&t, rcu_torture_timer, 0);
do {
if (irqreader && cur_ops->irq_capable) {
@@ -1239,6 +1241,7 @@ rcu_torture_stats_print(void)
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
static unsigned long rtcv_snap = ULONG_MAX;
+ static bool splatted;
struct task_struct *wtp;
for_each_possible_cpu(cpu) {
@@ -1324,6 +1327,10 @@ rcu_torture_stats_print(void)
gpnum, completed, flags,
wtp == NULL ? ~0UL : wtp->state,
wtp == NULL ? -1 : (int)task_cpu(wtp));
+ if (!splatted && wtp) {
+ sched_show_task(wtp);
+ splatted = true;
+ }
show_rcu_gp_kthreads();
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1357,7 +1364,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
"test_boost_duration=%d shutdown_secs=%d "
- "stall_cpu=%d stall_cpu_holdoff=%d "
+ "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
@@ -1365,7 +1372,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
test_boost_interval, test_boost_duration, shutdown_secs,
- stall_cpu, stall_cpu_holdoff,
+ stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
n_barrier_cbs,
onoff_interval, onoff_holdoff);
}
@@ -1430,12 +1437,19 @@ static int rcu_torture_stall(void *args)
if (!kthread_should_stop()) {
stop_at = get_seconds() + stall_cpu;
/* RCU CPU stall is expected behavior in following code. */
- pr_alert("rcu_torture_stall start.\n");
rcu_read_lock();
- preempt_disable();
+ if (stall_cpu_irqsoff)
+ local_irq_disable();
+ else
+ preempt_disable();
+ pr_alert("rcu_torture_stall start on CPU %d.\n",
+ smp_processor_id());
while (ULONG_CMP_LT(get_seconds(), stop_at))
continue; /* Induce RCU CPU stall warning. */
- preempt_enable();
+ if (stall_cpu_irqsoff)
+ local_irq_enable();
+ else
+ preempt_enable();
rcu_read_unlock();
pr_alert("rcu_torture_stall end.\n");
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3e3650e94ae6..f9c0ca2ccf0c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -534,8 +534,8 @@ module_param(rcu_kick_kthreads, bool, 0644);
* How long the grace period must be before we start recruiting
* quiescent-state help from rcu_note_context_switch().
*/
-static ulong jiffies_till_sched_qs = HZ / 20;
-module_param(jiffies_till_sched_qs, ulong, 0644);
+static ulong jiffies_till_sched_qs = HZ / 10;
+module_param(jiffies_till_sched_qs, ulong, 0444);
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
@@ -734,7 +734,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
int *fp = &rnp->need_future_gp[idx];
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
return READ_ONCE(*fp);
}
@@ -746,7 +746,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_gp_in_progress(rsp))
return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
@@ -773,7 +773,7 @@ static void rcu_eqs_enter_common(bool user)
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
!user && !is_idle_task(current)) {
@@ -837,10 +837,13 @@ static void rcu_eqs_enter(bool user)
* We crowbar the ->dynticks_nesting field to zero to allow for
* the possibility of usermode upcalls having messed up our count
* of interrupt nesting level during the prior busy period.
+ *
+ * If you add or remove a call to rcu_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_idle_enter(void)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rcu_eqs_enter(false);
}
@@ -852,10 +855,13 @@ void rcu_idle_enter(void)
* is permitted between this call and rcu_user_exit(). This way the
* CPU doesn't need to maintain the tick for RCU maintenance purposes
* when the CPU runs in userspace.
+ *
+ * If you add or remove a call to rcu_user_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_user_enter(void)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -875,12 +881,15 @@ void rcu_user_enter(void)
* Use things like work queues to work around this limitation.
*
* You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_exit(void)
{
struct rcu_dynticks *rdtp;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
/* Page faults can happen in NMI handlers, so check... */
@@ -899,6 +908,9 @@ void rcu_irq_exit(void)
/*
* Wrapper for rcu_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_exit_irqson(void)
{
@@ -947,7 +959,7 @@ static void rcu_eqs_exit(bool user)
struct rcu_dynticks *rdtp;
long long oldval;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
@@ -971,6 +983,9 @@ static void rcu_eqs_exit(bool user)
* allow for the possibility of usermode upcalls messing up our count
* of interrupt nesting level during the busy period that is just
* now starting.
+ *
+ * If you add or remove a call to rcu_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_idle_exit(void)
{
@@ -987,6 +1002,9 @@ void rcu_idle_exit(void)
*
* Exit RCU idle mode while entering the kernel because it can
* run a RCU read side critical section anytime.
+ *
+ * If you add or remove a call to rcu_user_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_user_exit(void)
{
@@ -1012,13 +1030,16 @@ void rcu_user_exit(void)
* Use things like work queues to work around this limitation.
*
* You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_enter(void)
{
struct rcu_dynticks *rdtp;
long long oldval;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
/* Page faults can happen in NMI handlers, so check... */
@@ -1037,6 +1058,9 @@ void rcu_irq_enter(void)
/*
* Wrapper for rcu_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_enter_irqson(void)
{
@@ -1055,6 +1079,9 @@ void rcu_irq_enter_irqson(void)
* that the CPU is active. This implementation permits nested NMIs, as
* long as the nesting level does not overflow an int. (You will probably
* run out of stack space first.)
+ *
+ * If you add or remove a call to rcu_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_nmi_enter(void)
{
@@ -1087,6 +1114,9 @@ void rcu_nmi_enter(void)
* RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
* to let the RCU grace-period handling know that the CPU is back to
* being RCU-idle.
+ *
+ * If you add or remove a call to rcu_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_nmi_exit(void)
{
@@ -1207,6 +1237,22 @@ static int rcu_is_cpu_rrupt_from_idle(void)
}
/*
+ * We are reporting a quiescent state on behalf of some other CPU, so
+ * it is our responsibility to check for and handle potential overflow
+ * of the rcu_node ->gpnum counter with respect to the rcu_data counters.
+ * After all, the CPU might be in deep idle state, and thus executing no
+ * code whatsoever.
+ */
+static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ lockdep_assert_held(&rnp->lock);
+ if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum))
+ WRITE_ONCE(rdp->gpwrap, true);
+ if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum))
+ rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4;
+}
+
+/*
* Snapshot the specified CPU's dynticks counter so that we can later
* credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state.
@@ -1216,15 +1262,34 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
- if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
- rdp->mynode->gpnum))
- WRITE_ONCE(rdp->gpwrap, true);
+ rcu_gpnum_ovf(rdp->mynode, rdp);
return 1;
}
return 0;
}
/*
+ * Handler for the irq_work request posted when a grace period has
+ * gone on for too long, but not yet long enough for an RCU CPU
+ * stall warning. Set state appropriately, but just complain if
+ * there is unexpected state on entry.
+ */
+static void rcu_iw_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = container_of(iwp, struct rcu_data, rcu_iw);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp);
+ if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
+ rdp->rcu_iw_gpnum = rnp->gpnum;
+ rdp->rcu_iw_pending = false;
+ }
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -1235,8 +1300,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
unsigned long jtsq;
bool *rnhqp;
bool *ruqp;
- unsigned long rjtsc;
- struct rcu_node *rnp;
+ struct rcu_node *rnp = rdp->mynode;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -1249,34 +1313,25 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
rdp->dynticks_fqs++;
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
}
- /* Compute and saturate jiffies_till_sched_qs. */
- jtsq = jiffies_till_sched_qs;
- rjtsc = rcu_jiffies_till_stall_check();
- if (jtsq > rjtsc / 2) {
- WRITE_ONCE(jiffies_till_sched_qs, rjtsc);
- jtsq = rjtsc / 2;
- } else if (jtsq < 1) {
- WRITE_ONCE(jiffies_till_sched_qs, 1);
- jtsq = 1;
- }
-
/*
* Has this CPU encountered a cond_resched_rcu_qs() since the
* beginning of the grace period? For this to be the case,
* the CPU has to have noticed the current grace period. This
* might not be the case for nohz_full CPUs looping in the kernel.
*/
- rnp = rdp->mynode;
+ jtsq = jiffies_till_sched_qs;
ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
- } else {
+ } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) {
/* Load rcu_qs_ctr before store to rcu_urgent_qs. */
smp_store_release(ruqp, true);
}
@@ -1285,6 +1340,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
rdp->offline_fqs++;
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
}
@@ -1304,10 +1360,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* updates are only once every few jiffies, the probability of
* lossage (and thus of slight grace-period extension) is
* quite low.
- *
- * Note that if the jiffies_till_sched_qs boot/sysfs parameter
- * is set too high, we override with half of the RCU CPU stall
- * warning delay.
*/
rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
if (!READ_ONCE(*rnhqp) &&
@@ -1316,15 +1368,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
WRITE_ONCE(*rnhqp, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
smp_store_release(ruqp, true);
- rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+ rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */
}
/*
- * If more than halfway to RCU CPU stall-warning time, do
- * a resched_cpu() to try to loosen things up a bit.
+ * If more than halfway to RCU CPU stall-warning time, do a
+ * resched_cpu() to try to loosen things up a bit. Also check to
+ * see if the CPU is getting hammered with interrupts, but only
+ * once per grace period, just to keep the IPIs down to a dull roar.
*/
- if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2)
+ if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) {
resched_cpu(rdp->cpu);
+ if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+ !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum &&
+ (rnp->ffmask & rdp->grpmask)) {
+ init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
+ rdp->rcu_iw_pending = true;
+ rdp->rcu_iw_gpnum = rnp->gpnum;
+ irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
+ }
+ }
return 0;
}
@@ -1513,6 +1576,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
{
int cpu;
unsigned long flags;
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
@@ -1528,7 +1592,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
*/
pr_err("INFO: %s self-detected stall on CPU", rsp->name);
print_cpu_stall_info_begin();
+ raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
print_cpu_stall_info(rsp, smp_processor_id());
+ raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
@@ -1922,6 +1988,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->core_needs_qs = need_gp;
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
+ rcu_gpnum_ovf(rnp, rdp);
}
return ret;
}
@@ -3702,6 +3769,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->cpu_no_qs.b.norm = true;
rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
rdp->core_needs_qs = false;
+ rdp->rcu_iw_pending = false;
+ rdp->rcu_iw_gpnum = rnp->gpnum - 1;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -3739,10 +3808,24 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
*/
int rcutree_online_cpu(unsigned int cpu)
{
- sync_sched_exp_online_cleanup(cpu);
- rcutree_affinity_setting(cpu, -1);
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask |= rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
if (IS_ENABLED(CONFIG_TREE_SRCU))
srcu_online_cpu(cpu);
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return 0; /* Too early in boot for scheduler work. */
+ sync_sched_exp_online_cleanup(cpu);
+ rcutree_affinity_setting(cpu, -1);
return 0;
}
@@ -3752,6 +3835,19 @@ int rcutree_online_cpu(unsigned int cpu)
*/
int rcutree_offline_cpu(unsigned int cpu)
{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask &= ~rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+
rcutree_affinity_setting(cpu, cpu);
if (IS_ENABLED(CONFIG_TREE_SRCU))
srcu_offline_cpu(cpu);
@@ -4200,8 +4296,7 @@ void __init rcu_init(void)
for_each_online_cpu(cpu) {
rcutree_prepare_cpu(cpu);
rcu_cpu_starting(cpu);
- if (IS_ENABLED(CONFIG_TREE_SRCU))
- srcu_online_cpu(cpu);
+ rcutree_online_cpu(cpu);
}
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e1f285f0a70..46a5d1991450 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -103,6 +103,7 @@ struct rcu_node {
/* Online CPUs for next expedited GP. */
/* Any CPU that has ever been online will */
/* have its bit set. */
+ unsigned long ffmask; /* Fully functional CPUs. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
@@ -285,6 +286,10 @@ struct rcu_data {
/* 8) RCU CPU stall data. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
+ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
+ struct irq_work rcu_iw; /* Check for non-irq activity. */
+ bool rcu_iw_pending; /* Is ->rcu_iw pending? */
+ unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */
int cpu;
struct rcu_state *rsp;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e012b9be777e..db85ca3975f1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -29,6 +29,7 @@
#include <linux/oom.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
+#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"
@@ -54,6 +55,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
* This probably needs to be excluded from -rt builds.
*/
#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
#endif /* #else #ifdef CONFIG_RCU_BOOST */
@@ -325,7 +327,7 @@ static void rcu_preempt_note_context_switch(bool preempt)
struct rcu_data *rdp;
struct rcu_node *rnp;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
+ lockdep_assert_irqs_disabled();
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
if (t->rcu_read_lock_nesting > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -530,7 +532,7 @@ void rcu_read_unlock_special(struct task_struct *t)
/* Unboost if we were boosted. */
if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_unlock(&rnp->boost_mtx);
+ rt_mutex_futex_unlock(&rnp->boost_mtx);
/*
* If this was the last task on the expedited lists,
@@ -911,8 +913,6 @@ void exit_rcu(void)
#ifdef CONFIG_RCU_BOOST
-#include "../locking/rtmutex_common.h"
-
static void rcu_wake_cond(struct task_struct *t, int status)
{
/*
@@ -1421,7 +1421,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
unsigned long dj;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1470,7 +1470,7 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_is_nocb_cpu(smp_processor_id()))
return;
@@ -1507,7 +1507,7 @@ static void rcu_prepare_for_idle(void)
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
- if (rcu_segcblist_pend_cbs(&rdp->cblist))
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
continue;
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1525,7 +1525,7 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
@@ -1671,6 +1671,7 @@ static void print_cpu_stall_info_begin(void)
*/
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
{
+ unsigned long delta;
char fast_no_hz[72];
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_dynticks *rdtp = rdp->dynticks;
@@ -1685,11 +1686,15 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
+ delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum;
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
"N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+ !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
+ rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
+ "!."[!delta],
ticks_value, ticks_title,
rcu_dynticks_snap(rdtp) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
@@ -2012,7 +2017,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (!rcu_is_nocb_cpu(smp_processor_id()))
return false; /* Not NOCBs CPU, caller must migrate CBs. */
__call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
@@ -2261,9 +2266,11 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
}
/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
-static void do_nocb_deferred_wakeup_timer(unsigned long x)
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
{
- do_nocb_deferred_wakeup_common((struct rcu_data *)x);
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+ do_nocb_deferred_wakeup_common(rdp);
}
/*
@@ -2327,8 +2334,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
raw_spin_lock_init(&rdp->nocb_lock);
- setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer,
- (unsigned long)rdp);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
}
/*
@@ -2583,7 +2589,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current);
+ housekeeping_affine(current, HK_FLAG_RCU);
}
/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5033b66d2753..fbd56d6e575b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -51,6 +51,7 @@
#include <linux/kthread.h>
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
+#include <linux/sched/isolation.h>
#define CREATE_TRACE_POINTS
@@ -494,6 +495,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#endif
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_suppress, int, 0644);
@@ -575,7 +577,6 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
-static void rcu_spawn_tasks_kthread(void);
static struct task_struct *rcu_tasks_kthread_ptr;
/**
@@ -600,7 +601,6 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
unsigned long flags;
bool needwake;
- bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);
rhp->next = NULL;
rhp->func = func;
@@ -610,11 +610,8 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
rcu_tasks_cbs_tail = &rhp->next;
raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
/* We can't create the thread unless interrupts are enabled. */
- if ((needwake && havetask) ||
- (!havetask && !irqs_disabled_flags(flags))) {
- rcu_spawn_tasks_kthread();
+ if (needwake && READ_ONCE(rcu_tasks_kthread_ptr))
wake_up(&rcu_tasks_cbs_wq);
- }
}
EXPORT_SYMBOL_GPL(call_rcu_tasks);
@@ -718,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
LIST_HEAD(rcu_tasks_holdouts);
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current);
+ housekeeping_affine(current, HK_FLAG_RCU);
/*
* Each pass through the following loop makes one check for
@@ -853,27 +850,18 @@ static int __noreturn rcu_tasks_kthread(void *arg)
}
}
-/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
-static void rcu_spawn_tasks_kthread(void)
+/* Spawn rcu_tasks_kthread() at core_initcall() time. */
+static int __init rcu_spawn_tasks_kthread(void)
{
- static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
struct task_struct *t;
- if (READ_ONCE(rcu_tasks_kthread_ptr)) {
- smp_mb(); /* Ensure caller sees full kthread. */
- return;
- }
- mutex_lock(&rcu_tasks_kthread_mutex);
- if (rcu_tasks_kthread_ptr) {
- mutex_unlock(&rcu_tasks_kthread_mutex);
- return;
- }
t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
BUG_ON(IS_ERR(t));
smp_mb(); /* Ensure others see full kthread. */
WRITE_ONCE(rcu_tasks_kthread_ptr, t);
- mutex_unlock(&rcu_tasks_kthread_mutex);
+ return 0;
}
+core_initcall(rcu_spawn_tasks_kthread);
/* Do the srcu_read_lock() for the above synchronize_srcu(). */
void exit_tasks_rcu_start(void)
diff --git a/kernel/resource.c b/kernel/resource.c
index 9b5f04404152..54ba6de3757c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -397,9 +397,32 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
res->start = p->start;
if (res->end > p->end)
res->end = p->end;
+ res->flags = p->flags;
+ res->desc = p->desc;
return 0;
}
+static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
+ bool first_level_children_only,
+ void *arg,
+ int (*func)(struct resource *, void *))
+{
+ u64 orig_end = res->end;
+ int ret = -1;
+
+ while ((res->start < res->end) &&
+ !find_next_iomem_res(res, desc, first_level_children_only)) {
+ ret = (*func)(res, arg);
+ if (ret)
+ break;
+
+ res->start = res->end + 1;
+ res->end = orig_end;
+ }
+
+ return ret;
+}
+
/*
* Walks through iomem resources and calls func() with matching resource
* ranges. This walks through whole tree and not just first level children.
@@ -415,29 +438,15 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
*/
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
- u64 end, void *arg, int (*func)(u64, u64, void *))
+ u64 end, void *arg, int (*func)(struct resource *, void *))
{
struct resource res;
- u64 orig_end;
- int ret = -1;
res.start = start;
res.end = end;
res.flags = flags;
- orig_end = res.end;
-
- while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, desc, false))) {
-
- ret = (*func)(res.start, res.end, arg);
- if (ret)
- break;
-
- res.start = res.end + 1;
- res.end = orig_end;
- }
- return ret;
+ return __walk_iomem_res_desc(&res, desc, false, arg, func);
}
/*
@@ -448,25 +457,33 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
* ranges.
*/
int walk_system_ram_res(u64 start, u64 end, void *arg,
- int (*func)(u64, u64, void *))
+ int (*func)(struct resource *, void *))
{
struct resource res;
- u64 orig_end;
- int ret = -1;
res.start = start;
res.end = end;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- orig_end = res.end;
- while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
- ret = (*func)(res.start, res.end, arg);
- if (ret)
- break;
- res.start = res.end + 1;
- res.end = orig_end;
- }
- return ret;
+
+ return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+ arg, func);
+}
+
+/*
+ * This function calls the @func callback against all memory ranges, which
+ * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ */
+int walk_mem_res(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ struct resource res;
+
+ res.start = start;
+ res.end = end;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+ return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+ arg, func);
}
#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
@@ -508,6 +525,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
{
return 1;
}
+
/*
* This generic page_is_ram() returns true if specified address is
* registered as System RAM in iomem_resource list.
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index a9ee16bbc693..e2f9d4feff40 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
+obj-$(CONFIG_CPU_ISOLATION) += isolation.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ca0f8fc945c6..e086babe6c61 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -388,7 +388,7 @@ void sched_clock_tick(void)
if (unlikely(!sched_clock_running))
return;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
scd = this_scd();
__scd_stamp(scd);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da523a0..5b82a0073532 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/sched/isolation.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -42,18 +43,21 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* Debugging: various feature bits
+ *
+ * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
+ * sysctl_sched_features, defined in sched.h, to allow constants propagation
+ * at compile time and compiler optimization based on features default.
*/
-
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
-
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
-
#undef SCHED_FEAT
+#endif
/*
* Number of tasks to iterate in a single balance run.
@@ -83,9 +87,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-/* CPUs with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -505,8 +506,7 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- return;
+ raw_spin_lock_irqsave(&rq->lock, flags);
resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -526,7 +526,7 @@ int get_nohz_timer_target(void)
int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
return cpu;
rcu_read_lock();
@@ -535,15 +535,15 @@ int get_nohz_timer_target(void)
if (cpu == i)
continue;
- if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
cpu = i;
goto unlock;
}
}
}
- if (!is_housekeeping_cpu(cpu))
- cpu = housekeeping_any_cpu();
+ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
+ cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
rcu_read_unlock();
return cpu;
@@ -733,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
@@ -747,8 +747,16 @@ static void set_load_weight(struct task_struct *p)
return;
}
- load->weight = scale_load(sched_prio_to_weight[prio]);
- load->inv_weight = sched_prio_to_wmult[prio];
+ /*
+ * SCHED_OTHER tasks have to update their load when changing their
+ * weight
+ */
+ if (update_load && p->sched_class == &fair_sched_class) {
+ reweight_task(p, prio);
+ } else {
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
+ }
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -2358,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = __normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, false);
/*
* We don't need the reset flag anymore after the fork. It has
@@ -3805,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
+ set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
delta = p->prio - old_prio;
@@ -3962,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, true);
}
/* Actually do priority change: must hold pi & rq lock. */
@@ -4842,6 +4850,7 @@ int __sched _cond_resched(void)
preempt_schedule_common();
return 1;
}
+ rcu_all_qs();
return 0;
}
EXPORT_SYMBOL(_cond_resched);
@@ -5165,6 +5174,7 @@ void sched_show_task(struct task_struct *p)
show_stack(p, NULL);
put_task_stack(p);
}
+EXPORT_SYMBOL_GPL(sched_show_task);
static inline bool
state_filter_match(unsigned long state_filter, struct task_struct *p)
@@ -5726,10 +5736,6 @@ static inline void sched_init_smt(void) { }
void __init sched_init_smp(void)
{
- cpumask_var_t non_isolated_cpus;
-
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-
sched_init_numa();
/*
@@ -5739,16 +5745,12 @@ void __init sched_init_smp(void)
*/
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
- if (cpumask_empty(non_isolated_cpus))
- cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
sched_init_granularity();
- free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
init_sched_dl_class();
@@ -5933,7 +5935,7 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
- set_load_weight(&init_task);
+ set_load_weight(&init_task, false);
/*
* The boot idle thread does lazy MMU switching as well:
@@ -5952,9 +5954,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- /* May be allocated at isolcpus cmdline parse time */
- if (cpu_isolated_map == NULL)
- zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 14d2dbf97c53..9be8b68a66da 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -259,8 +259,7 @@ static inline u64 account_other_time(u64 max)
{
u64 accounted;
- /* Shall be converted to a lockdep-enabled lightweight check */
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
accounted = steal_account_process_time(max);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 4ae5c1ea90e2..f349f7e98dec 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p)
if (p->state == TASK_DEAD)
sub_rq_bw(p->dl.dl_bw, &rq->dl);
raw_spin_lock(&dl_b->lock);
- __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p);
raw_spin_unlock(&dl_b->lock);
}
@@ -1210,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
}
raw_spin_lock(&dl_b->lock);
- __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
__dl_clear_params(p);
@@ -1365,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
update_dl_entity(dl_se, pi_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
+ } else if ((flags & ENQUEUE_RESTORE) &&
+ dl_time_before(dl_se->deadline,
+ rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+ setup_new_dl_entity(dl_se);
}
__enqueue_dl_entity(dl_se);
@@ -2167,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* until we complete the update.
*/
raw_spin_lock(&src_dl_b->lock);
- __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
@@ -2256,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
return;
}
- /*
- * If p is boosted we already updated its params in
- * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
- * p's deadline being now already after rq_clock(rq).
- */
- if (dl_time_before(p->dl.deadline, rq_clock(rq)))
- setup_new_dl_entity(&p->dl);
if (rq->curr != p) {
#ifdef CONFIG_SMP
@@ -2452,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
if (dl_policy(policy) && !task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
- __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
@@ -2464,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* But this would require to set the task's "inactive
* timer" when the task is not inactive.
*/
- __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
dl_change_utilization(p, new_bw);
err = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f93e4a2d9f6..1ca0130ed4f9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P_SCHEDSTAT(se->statistics.wait_count);
}
P(se->load.weight);
+ P(se->runnable_weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
+ P(se->avg.runnable_load_avg);
#endif
#undef PN_SCHEDSTAT
@@ -558,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
+ SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
- cfs_rq->runnable_load_avg);
+ cfs_rq->avg.runnable_load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
- atomic_long_read(&cfs_rq->removed_load_avg));
- SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
- atomic_long_read(&cfs_rq->removed_util_avg));
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
+ cfs_rq->removed.load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
+ cfs_rq->removed.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
+ cfs_rq->removed.runnable_sum);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
@@ -1004,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
+ P(se.runnable_weight);
#ifdef CONFIG_SMP
P(se.avg.load_sum);
+ P(se.avg.runnable_load_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
+ P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c09ddf8c832..0989676c50e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -33,6 +33,7 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
@@ -717,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- sa->last_update_time = 0;
- /*
- * sched_avg's period_contrib should be strictly less then 1024, so
- * we give it 1023 to make sure it is almost a period (1024us), and
- * will definitely be update (after enqueue).
- */
- sa->period_contrib = 1023;
+ memset(sa, 0, sizeof(*sa));
+
/*
* Tasks are intialized with full load to be seen as heavy tasks until
* they get a chance to stabilize to their real load level.
@@ -731,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se)
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
- sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- /*
- * At this point, util_avg won't be used in select_task_rq_fair anyway
- */
- sa->util_avg = 0;
- sa->util_sum = 0;
+ sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+
+ se->runnable_weight = se->load.weight;
+
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
@@ -785,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
} else {
sa->util_avg = cap;
}
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
if (entity_is_task(se)) {
@@ -2026,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
- delta = p->se.avg.load_sum / p->se.load.weight;
+ delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
}
@@ -2693,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->nr_running--;
}
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_SMP
+/*
+ * XXX we want to get rid of these helpers and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+ return scale_load_down(se->load.weight);
+}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ return scale_load_down(se->runnable_weight);
+}
+
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->runnable_weight += se->runnable_weight;
+
+ cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
+ cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
+}
+
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->runnable_weight -= se->runnable_weight;
+
+ sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
+ sub_positive(&cfs_rq->avg.runnable_load_sum,
+ se_runnable(se) * se->avg.runnable_load_sum);
+}
+
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->avg.load_avg += se->avg.load_avg;
+ cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+}
+
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+}
+#else
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+#endif
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight, unsigned long runnable)
+{
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ if (cfs_rq->curr == se)
+ update_curr(cfs_rq);
+ account_entity_dequeue(cfs_rq, se);
+ dequeue_runnable_load_avg(cfs_rq, se);
+ }
+ dequeue_load_avg(cfs_rq, se);
+
+ se->runnable_weight = runnable;
+ update_load_set(&se->load, weight);
+
+#ifdef CONFIG_SMP
+ do {
+ u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+
+ se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
+ se->avg.runnable_load_avg =
+ div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
+ } while (0);
+#endif
+
+ enqueue_load_avg(cfs_rq, se);
+ if (se->on_rq) {
+ account_entity_enqueue(cfs_rq, se);
+ enqueue_runnable_load_avg(cfs_rq, se);
+ }
+}
+
+void reweight_task(struct task_struct *p, int prio)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct load_weight *load = &se->load;
+ unsigned long weight = scale_load(sched_prio_to_weight[prio]);
+
+ reweight_entity(cfs_rq, se, weight, weight);
+ load->inv_weight = sched_prio_to_wmult[prio];
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+/*
+ * All this does is approximate the hierarchical proportion which includes that
+ * global sum we all love to hate.
+ *
+ * That is, the weight of a group entity, is the proportional share of the
+ * group weight based on the group runqueue weights. That is:
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- (1)
+ * \Sum grq->load.weight
+ *
+ * Now, because computing that sum is prohibitively expensive to compute (been
+ * there, done that) we approximate it with this average stuff. The average
+ * moves slower and therefore the approximation is cheaper and more stable.
+ *
+ * So instead of the above, we substitute:
+ *
+ * grq->load.weight -> grq->avg.load_avg (2)
+ *
+ * which yields the following:
+ *
+ * tg->weight * grq->avg.load_avg
+ * ge->load.weight = ------------------------------ (3)
+ * tg->load_avg
+ *
+ * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ *
+ * That is shares_avg, and it is right (given the approximation (2)).
+ *
+ * The problem with it is that because the average is slow -- it was designed
+ * to be exactly that of course -- this leads to transients in boundary
+ * conditions. In specific, the case where the group was idle and we start the
+ * one task. It takes time for our CPU's grq->avg.load_avg to build up,
+ * yielding bad latency etc..
+ *
+ * Now, in that special case (1) reduces to:
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- = tg->weight (4)
+ * grp->load.weight
+ *
+ * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ *
+ * So what we do is modify our approximation (3) to approach (4) in the (near)
+ * UP case, like:
+ *
+ * ge->load.weight =
+ *
+ * tg->weight * grq->load.weight
+ * --------------------------------------------------- (5)
+ * tg->load_avg - grq->avg.load_avg + grq->load.weight
+ *
+ * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ *
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- (6)
+ * tg_load_avg'
+ *
+ * Where:
+ *
+ * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ * max(grq->load.weight, grq->avg.load_avg)
+ *
+ * And that is shares_weight and is icky. In the (near) UP case it approaches
+ * (4) while in the normal case it approaches (3). It consistently
+ * overestimates the ge->load.weight and therefore:
+ *
+ * \Sum ge->load.weight >= tg->weight
+ *
+ * hence icky!
+ */
+static long calc_group_shares(struct cfs_rq *cfs_rq)
{
- long tg_weight, load, shares;
+ long tg_weight, tg_shares, load, shares;
+ struct task_group *tg = cfs_rq->tg;
- /*
- * This really should be: cfs_rq->avg.load_avg, but instead we use
- * cfs_rq->load.weight, which is its upper bound. This helps ramp up
- * the shares for small weight interactive tasks.
- */
- load = scale_load_down(cfs_rq->load.weight);
+ tg_shares = READ_ONCE(tg->shares);
+
+ load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
tg_weight = atomic_long_read(&tg->load_avg);
@@ -2712,7 +2912,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
tg_weight -= cfs_rq->tg_load_avg_contrib;
tg_weight += load;
- shares = (tg->shares * load);
+ shares = (tg_shares * load);
if (tg_weight)
shares /= tg_weight;
@@ -2728,63 +2928,86 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
* case no task is runnable on a CPU MIN_SHARES=2 should be returned
* instead of 0.
*/
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
-
- return shares;
-}
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- return tg->shares;
+ return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
-# endif /* CONFIG_SMP */
-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- unsigned long weight)
+/*
+ * This calculates the effective runnable weight for a group entity based on
+ * the group entity weight calculated above.
+ *
+ * Because of the above approximation (2), our group entity weight is
+ * an load_avg based ratio (3). This means that it includes blocked load and
+ * does not represent the runnable weight.
+ *
+ * Approximate the group entity's runnable weight per ratio from the group
+ * runqueue:
+ *
+ * grq->avg.runnable_load_avg
+ * ge->runnable_weight = ge->load.weight * -------------------------- (7)
+ * grq->avg.load_avg
+ *
+ * However, analogous to above, since the avg numbers are slow, this leads to
+ * transients in the from-idle case. Instead we use:
+ *
+ * ge->runnable_weight = ge->load.weight *
+ *
+ * max(grq->avg.runnable_load_avg, grq->runnable_weight)
+ * ----------------------------------------------------- (8)
+ * max(grq->avg.load_avg, grq->load.weight)
+ *
+ * Where these max() serve both to use the 'instant' values to fix the slow
+ * from-idle and avoid the /0 on to-idle, similar to (6).
+ */
+static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
{
- if (se->on_rq) {
- /* commit outstanding execution time */
- if (cfs_rq->curr == se)
- update_curr(cfs_rq);
- account_entity_dequeue(cfs_rq, se);
- }
+ long runnable, load_avg;
- update_load_set(&se->load, weight);
+ load_avg = max(cfs_rq->avg.load_avg,
+ scale_load_down(cfs_rq->load.weight));
- if (se->on_rq)
- account_entity_enqueue(cfs_rq, se);
+ runnable = max(cfs_rq->avg.runnable_load_avg,
+ scale_load_down(cfs_rq->runnable_weight));
+
+ runnable *= shares;
+ if (load_avg)
+ runnable /= load_avg;
+
+ return clamp_t(long, runnable, MIN_SHARES, shares);
}
+# endif /* CONFIG_SMP */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct sched_entity *se)
+/*
+ * Recomputes the group entity based on the current state of its group
+ * runqueue.
+ */
+static void update_cfs_group(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg;
- long shares;
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long shares, runnable;
- if (!cfs_rq)
+ if (!gcfs_rq)
return;
- if (throttled_hierarchy(cfs_rq))
+ if (throttled_hierarchy(gcfs_rq))
return;
- tg = cfs_rq->tg;
-
#ifndef CONFIG_SMP
- if (likely(se->load.weight == tg->shares))
+ runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
+
+ if (likely(se->load.weight == shares))
return;
+#else
+ shares = calc_group_shares(gcfs_rq);
+ runnable = calc_group_runnable(gcfs_rq, shares);
#endif
- shares = calc_cfs_shares(cfs_rq, tg);
- reweight_entity(cfs_rq_of(se), se, shares);
+ reweight_entity(cfs_rq_of(se), se, shares, runnable);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct sched_entity *se)
+static inline void update_cfs_group(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2893,7 +3116,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
*/
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ unsigned long load, unsigned long runnable, int running)
{
unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
@@ -2910,10 +3133,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods);
- }
+ sa->runnable_load_sum =
+ decay_load(sa->runnable_load_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
/*
@@ -2926,11 +3147,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
sa->period_contrib = delta;
contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
+ if (load)
+ sa->load_sum += load * contrib;
+ if (runnable)
+ sa->runnable_load_sum += runnable * contrib;
if (running)
sa->util_sum += contrib * scale_cpu;
@@ -2966,8 +3186,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long load, unsigned long runnable, int running)
{
u64 delta;
@@ -3000,8 +3220,8 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
* this happens during idle_balance() which calls
* update_blocked_averages()
*/
- if (!weight)
- running = 0;
+ if (!load)
+ runnable = running = 0;
/*
* Now we know we crossed measurement unit boundaries. The *_avg
@@ -3010,63 +3230,96 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
return 0;
+ return 1;
+}
+
+static __always_inline void
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
+{
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
- }
- sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
-
- return 1;
+ sa->load_avg = div_u64(load * sa->load_sum, divider);
+ sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
+ sa->util_avg = sa->util_sum / divider;
}
+/*
+ * sched_entity:
+ *
+ * task:
+ * se_runnable() == se_weight()
+ *
+ * group: [ see update_cfs_group() ]
+ * se_weight() = tg->weight * grq->load_avg / tg->load_avg
+ * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
+ * load_sum := runnable_sum
+ * load_avg = se_weight(se) * runnable_avg
+ *
+ * runnable_load_sum := runnable_sum
+ * runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ * XXX collapse load_sum and runnable_load_sum
+ *
+ * cfq_rs:
+ *
+ * load_sum = \Sum se_weight(se) * se->avg.load_sum
+ * load_avg = \Sum se->avg.load_avg
+ *
+ * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ * runnable_load_avg = \Sum se->avg.runable_load_avg
+ */
+
static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
- return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ return 1;
+ }
+
+ return 0;
}
static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return ___update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ cfs_rq->curr == se)) {
+
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ return 1;
+ }
+
+ return 0;
}
static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
- return ___update_load_avg(now, cpu, &cfs_rq->avg,
- scale_load_down(cfs_rq->load.weight),
- cfs_rq->curr != NULL, cfs_rq);
-}
+ if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ scale_load_down(cfs_rq->load.weight),
+ scale_load_down(cfs_rq->runnable_weight),
+ cfs_rq->curr != NULL)) {
-/*
- * Signed add and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define add_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(_val) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- \
- res = var + val; \
- \
- if (val < 0 && res > var) \
- res = 0; \
- \
- WRITE_ONCE(*ptr, res); \
-} while (0)
+ ___update_load_avg(&cfs_rq->avg, 1, 1);
+ return 1;
+ }
+
+ return 0;
+}
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
@@ -3149,11 +3402,77 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
-/* Take into account change of utilization of a child task group */
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ * ge->avg == grq->avg (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
+ *
+ * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3)
+ *
+ * And per (1) we have:
+ *
+ * ge->avg.running_avg == grq->avg.running_avg
+ *
+ * Which gives:
+ *
+ * ge->load.weight * grq->avg.load_avg
+ * ge->avg.load_avg = ----------------------------------- (4)
+ * grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ * grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ * d(runnable_avg) += se->avg.load_avg / se->load.weight
+ * (5)
+ * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
/* Nothing to update */
@@ -3169,102 +3488,65 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
}
-/* Take into account change of load of a child task group */
static inline void
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long delta, load = gcfs_rq->avg.load_avg;
+ long runnable_sum = gcfs_rq->prop_runnable_sum;
+ long runnable_load_avg, load_avg;
+ s64 runnable_load_sum, load_sum;
- /*
- * If the load of group cfs_rq is null, the load of the
- * sched_entity will also be null so we can skip the formula
- */
- if (load) {
- long tg_load;
+ if (!runnable_sum)
+ return;
- /* Get tg's load and ensure tg_load > 0 */
- tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+ gcfs_rq->prop_runnable_sum = 0;
- /* Ensure tg_load >= load and updated with current load*/
- tg_load -= gcfs_rq->tg_load_avg_contrib;
- tg_load += load;
+ load_sum = (s64)se_weight(se) * runnable_sum;
+ load_avg = div_s64(load_sum, LOAD_AVG_MAX);
- /*
- * We need to compute a correction term in the case that the
- * task group is consuming more CPU than a task of equal
- * weight. A task with a weight equals to tg->shares will have
- * a load less or equal to scale_load_down(tg->shares).
- * Similarly, the sched_entities that represent the task group
- * at parent level, can't have a load higher than
- * scale_load_down(tg->shares). And the Sum of sched_entities'
- * load must be <= scale_load_down(tg->shares).
- */
- if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- /* scale gcfs_rq's load into tg's shares*/
- load *= scale_load_down(gcfs_rq->tg->shares);
- load /= tg_load;
- }
- }
+ add_positive(&se->avg.load_sum, runnable_sum);
+ add_positive(&se->avg.load_avg, load_avg);
- delta = load - se->avg.load_avg;
+ add_positive(&cfs_rq->avg.load_avg, load_avg);
+ add_positive(&cfs_rq->avg.load_sum, load_sum);
- /* Nothing to update */
- if (!delta)
- return;
-
- /* Set new sched_entity's load */
- se->avg.load_avg = load;
- se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+ runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
+ runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
- /* Update parent cfs_rq load */
- add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+ add_positive(&se->avg.runnable_load_sum, runnable_sum);
+ add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
- /*
- * If the sched_entity is already enqueued, we also have to update the
- * runnable load avg.
- */
if (se->on_rq) {
- /* Update parent cfs_rq runnable_load_avg */
- add_positive(&cfs_rq->runnable_load_avg, delta);
- cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
+ add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
}
}
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
-{
- cfs_rq->propagate_avg = 1;
-}
-
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
-
- if (!cfs_rq->propagate_avg)
- return 0;
-
- cfs_rq->propagate_avg = 0;
- return 1;
+ cfs_rq->propagate = 1;
+ cfs_rq->prop_runnable_sum += runnable_sum;
}
/* Update task and its cfs_rq load average */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *gcfs_rq;
if (entity_is_task(se))
return 0;
- if (!test_and_clear_tg_cfs_propagate(se))
+ gcfs_rq = group_cfs_rq(se);
+ if (!gcfs_rq->propagate)
return 0;
+ gcfs_rq->propagate = 0;
+
cfs_rq = cfs_rq_of(se);
- set_tg_cfs_propagate(cfs_rq);
+ add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
- update_tg_cfs_util(cfs_rq, se);
- update_tg_cfs_load(cfs_rq, se);
+ update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
return 1;
}
@@ -3288,7 +3570,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
* If there is a pending propagation, we have to update the load and
* the utilization of the sched_entity:
*/
- if (gcfs_rq->propagate_avg)
+ if (gcfs_rq->propagate)
return false;
/*
@@ -3308,27 +3590,10 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
return 0;
}
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
@@ -3348,65 +3613,45 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
struct sched_avg *sa = &cfs_rq->avg;
- int decayed, removed_load = 0, removed_util = 0;
+ int decayed = 0;
- if (atomic_long_read(&cfs_rq->removed_load_avg)) {
- s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ if (cfs_rq->removed.nr) {
+ unsigned long r;
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
+ raw_spin_lock(&cfs_rq->removed.lock);
+ swap(cfs_rq->removed.util_avg, removed_util);
+ swap(cfs_rq->removed.load_avg, removed_load);
+ swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
+ cfs_rq->removed.nr = 0;
+ raw_spin_unlock(&cfs_rq->removed.lock);
+
+ r = removed_load;
sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
- removed_load = 1;
- set_tg_cfs_propagate(cfs_rq);
- }
+ sub_positive(&sa->load_sum, r * divider);
- if (atomic_long_read(&cfs_rq->removed_util_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+ r = removed_util;
sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
- removed_util = 1;
- set_tg_cfs_propagate(cfs_rq);
+ sub_positive(&sa->util_sum, r * divider);
+
+ add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
+
+ decayed = 1;
}
- decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+ decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (decayed || removed_util)
+ if (decayed)
cfs_rq_util_change(cfs_rq);
- return decayed || removed_load;
-}
-
-/*
- * Optional action to be done while updating the load average
- */
-#define UPDATE_TG 0x1
-#define SKIP_AGE_LOAD 0x2
-
-/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int flags)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
- int decayed;
-
- /*
- * Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
- */
- if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
- __update_load_avg_se(now, cpu, cfs_rq, se);
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq);
- decayed |= propagate_entity_load_avg(se);
-
- if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ return decayed;
}
/**
@@ -3419,12 +3664,39 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
+ /*
+ * When we attach the @se to the @cfs_rq, we must align the decay
+ * window because without that, really weird and wonderful things can
+ * happen.
+ *
+ * XXX illustrate
+ */
se->avg.last_update_time = cfs_rq->avg.last_update_time;
- cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se->avg.load_sum;
+ se->avg.period_contrib = cfs_rq->avg.period_contrib;
+
+ /*
+ * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
+ * period_contrib. This isn't strictly correct, but since we're
+ * entirely outside of the PELT hierarchy, nobody cares if we truncate
+ * _sum a little.
+ */
+ se->avg.util_sum = se->avg.util_avg * divider;
+
+ se->avg.load_sum = divider;
+ if (se_weight(se)) {
+ se->avg.load_sum =
+ div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
+ }
+
+ se->avg.runnable_load_sum = se->avg.load_sum;
+
+ enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
- set_tg_cfs_propagate(cfs_rq);
+
+ add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
}
@@ -3439,39 +3711,47 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
-
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+ dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- set_tg_cfs_propagate(cfs_rq);
+
+ add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
}
-/* Add the load generated by se into cfs_rq's load average */
-static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+#define DO_ATTACH 0x4
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- struct sched_avg *sa = &se->avg;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+ int decayed;
+
+ /*
+ * Track task load average for carrying it to new CPU after migrated, and
+ * track group sched_entity load average for task_h_load calc in migration
+ */
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+ __update_load_avg_se(now, cpu, cfs_rq, se);
- cfs_rq->runnable_load_avg += sa->load_avg;
- cfs_rq->runnable_load_sum += sa->load_sum;
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
+ decayed |= propagate_entity_load_avg(se);
+
+ if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
- if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
- }
-}
-/* Remove the runnable load generated by se from cfs_rq's runnable load average */
-static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- cfs_rq->runnable_load_avg =
- max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
- cfs_rq->runnable_load_sum =
- max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+ } else if (decayed && (flags & UPDATE_TG))
+ update_tg_load_avg(cfs_rq, 0);
}
#ifndef CONFIG_64BIT
@@ -3515,6 +3795,7 @@ void sync_entity_load_avg(struct sched_entity *se)
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ unsigned long flags;
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3527,13 +3808,18 @@ void remove_entity_load_avg(struct sched_entity *se)
*/
sync_entity_load_avg(se);
- atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+
+ raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+ ++cfs_rq->removed.nr;
+ cfs_rq->removed.util_avg += se->avg.util_avg;
+ cfs_rq->removed.load_avg += se->avg.load_avg;
+ cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
+ raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
- return cfs_rq->runnable_load_avg;
+ return cfs_rq->avg.runnable_load_avg;
}
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3553,16 +3839,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
+#define DO_ATTACH 0x0
-static inline void update_load_avg(struct sched_entity *se, int not_used1)
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
- cfs_rq_util_change(cfs_rq_of(se));
+ cfs_rq_util_change(cfs_rq);
}
-static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
@@ -3707,9 +3990,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
*/
- update_load_avg(se, UPDATE_TG);
- enqueue_entity_load_avg(cfs_rq, se);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+ update_cfs_group(se);
+ enqueue_runnable_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
@@ -3791,8 +4074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
- update_load_avg(se, UPDATE_TG);
- dequeue_entity_load_avg(cfs_rq, se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ dequeue_runnable_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se, flags);
@@ -3815,7 +4098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_cfs_shares(se);
+ update_cfs_group(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
@@ -3879,7 +4162,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
@@ -3981,7 +4264,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
- update_load_avg(prev, 0);
+ update_load_avg(cfs_rq, prev, 0);
}
cfs_rq->curr = NULL;
}
@@ -3997,8 +4280,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, UPDATE_TG);
- update_cfs_shares(curr);
+ update_load_avg(cfs_rq, curr, UPDATE_TG);
+ update_cfs_group(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4915,8 +5198,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, UPDATE_TG);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
}
if (!se)
@@ -4974,8 +5257,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, UPDATE_TG);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
}
if (!se)
@@ -5449,6 +5732,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
@@ -5456,8 +5741,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
{
struct sched_group *idlest = NULL, *group = sd->groups;
struct sched_group *most_spare_sg = NULL;
- unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+ unsigned long min_runnable_load = ULONG_MAX;
+ unsigned long this_runnable_load = ULONG_MAX;
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
@@ -5578,10 +5864,10 @@ skip_spare:
}
/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -5630,6 +5916,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int new_cpu = cpu;
+
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ return prev_cpu;
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
+ if (new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ return new_cpu;
+}
+
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5982,50 +6315,30 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
new_cpu = cpu;
}
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+ /*
+ * We're going to need the task's util for capacity_spare_wake
+ * in find_idlest_group. Sync it up to prev_cpu's
+ * last_update_time.
+ */
+ sync_entity_load_avg(&p->se);
+ }
+
if (!sd) {
- pick_cpu:
+pick_cpu:
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- } else while (sd) {
- struct sched_group *group;
- int weight;
-
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
- }
-
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
-
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
- }
-
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
- }
- /* while loop will break here if sd == NULL */
+ } else {
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
return new_cpu;
}
+static void detach_entity_cfs_rq(struct sched_entity *se);
+
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -6059,14 +6372,25 @@ static void migrate_task_rq_fair(struct task_struct *p)
se->vruntime -= min_vruntime;
}
- /*
- * We are supposed to update the task to "current" time, then its up to date
- * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
- * what current time is, so simply throw away the out-of-date time. This
- * will result in the wakee task is less decayed, but giving the wakee more
- * load sounds not bad.
- */
- remove_entity_load_avg(&p->se);
+ if (p->on_rq == TASK_ON_RQ_MIGRATING) {
+ /*
+ * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
+ * rq->lock and can modify state directly.
+ */
+ lockdep_assert_held(&task_rq(p)->lock);
+ detach_entity_cfs_rq(&p->se);
+
+ } else {
+ /*
+ * We are supposed to update the task to "current" time, then
+ * its up to date and ready to go to new CPU/cfs_rq. But we
+ * have difficulty in getting what current time is, so simply
+ * throw away the out-of-date time. This will result in the
+ * wakee task is less decayed, but giving the wakee more load
+ * sounds not bad.
+ */
+ remove_entity_load_avg(&p->se);
+ }
/* Tell new CPU we are migrated */
p->se.avg.last_update_time = 0;
@@ -6334,10 +6658,7 @@ again:
set_next_entity(cfs_rq, se);
}
- if (hrtick_enabled(rq))
- hrtick_start_fair(rq, p);
-
- return p;
+ goto done;
simple:
#endif
@@ -6351,6 +6672,16 @@ simple:
p = task_of(se);
+done: __maybe_unused
+#ifdef CONFIG_SMP
+ /*
+ * Move the next running task to the front of
+ * the list, so our cfs_tasks list becomes MRU
+ * one.
+ */
+ list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
@@ -6786,11 +7117,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
*/
static struct task_struct *detach_one_task(struct lb_env *env)
{
- struct task_struct *p, *n;
+ struct task_struct *p;
lockdep_assert_held(&env->src_rq->lock);
- list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ list_for_each_entry_reverse(p,
+ &env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
@@ -6836,7 +7168,7 @@ static int detach_tasks(struct lb_env *env)
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
break;
- p = list_first_entry(tasks, struct task_struct, se.group_node);
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
env->loop++;
/* We've more or less seen every task there is, call it quits */
@@ -6886,7 +7218,7 @@ static int detach_tasks(struct lb_env *env)
continue;
next:
- list_move_tail(&p->se.group_node, tasks);
+ list_move(&p->se.group_node, tasks);
}
/*
@@ -6962,7 +7294,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->avg.util_sum)
return false;
- if (cfs_rq->runnable_load_sum)
+ if (cfs_rq->avg.runnable_load_sum)
return false;
return true;
@@ -6994,7 +7326,7 @@ static void update_blocked_averages(int cpu)
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
- update_load_avg(se, 0);
+ update_load_avg(cfs_rq_of(se), se, 0);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
@@ -7875,8 +8207,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ /*
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+ * capacities from resulting in underutilization due to avg_load.
+ */
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
@@ -8693,7 +9028,7 @@ void nohz_balance_enter_idle(int cpu)
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!is_housekeeping_cpu(cpu))
+ if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
return;
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
@@ -9158,7 +9493,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
}
}
#else
@@ -9170,7 +9505,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/* Catch up with the cfs_rq and remove our load when we leave */
- update_load_avg(se, 0);
+ update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
@@ -9189,7 +9524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
#endif
/* Synchronize entity with its cfs_rq */
- update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
@@ -9271,11 +9606,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
-#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq->propagate_avg = 0;
-#endif
- atomic_long_set(&cfs_rq->removed_load_avg, 0);
- atomic_long_set(&cfs_rq->removed_util_avg, 0);
+ raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
@@ -9473,8 +9804,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
for_each_sched_entity(se) {
- update_load_avg(se, UPDATE_TG);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
+ update_cfs_group(se);
}
rq_unlock_irqrestore(rq, &rf);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 257f4f0b4532..7dae9eb8c042 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -209,6 +209,7 @@ exit_idle:
*/
static void do_idle(void)
{
+ int cpu = smp_processor_id();
/*
* If the arch has a polling bit, we maintain an invariant:
*
@@ -219,14 +220,13 @@ static void do_idle(void)
*/
__current_set_polling();
- quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
check_pgt_cache();
rmb();
- if (cpu_is_offline(smp_processor_id())) {
+ if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
new file mode 100644
index 000000000000..b71b436f59f2
--- /dev/null
+++ b/kernel/sched/isolation.c
@@ -0,0 +1,155 @@
+/*
+ * Housekeeping management. Manage the targets for routine code that can run on
+ * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
+ *
+ * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
+ *
+ */
+
+#include <linux/sched/isolation.h>
+#include <linux/tick.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/static_key.h>
+#include <linux/ctype.h>
+
+DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
+EXPORT_SYMBOL_GPL(housekeeping_overriden);
+static cpumask_var_t housekeeping_mask;
+static unsigned int housekeeping_flags;
+
+int housekeeping_any_cpu(enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ return smp_processor_id();
+}
+EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
+
+const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ return housekeeping_mask;
+ return cpu_possible_mask;
+}
+EXPORT_SYMBOL_GPL(housekeeping_cpumask);
+
+void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ set_cpus_allowed_ptr(t, housekeeping_mask);
+}
+EXPORT_SYMBOL_GPL(housekeeping_affine);
+
+bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ return cpumask_test_cpu(cpu, housekeeping_mask);
+ return true;
+}
+EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
+
+void __init housekeeping_init(void)
+{
+ if (!housekeeping_flags)
+ return;
+
+ static_branch_enable(&housekeeping_overriden);
+
+ /* We need at least one CPU to handle housekeeping work */
+ WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+}
+
+static int __init housekeeping_setup(char *str, enum hk_flags flags)
+{
+ cpumask_var_t non_housekeeping_mask;
+ int err;
+
+ alloc_bootmem_cpumask_var(&non_housekeeping_mask);
+ err = cpulist_parse(str, non_housekeeping_mask);
+ if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
+ pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+ return 0;
+ }
+
+ if (!housekeeping_flags) {
+ alloc_bootmem_cpumask_var(&housekeeping_mask);
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, non_housekeeping_mask);
+ if (cpumask_empty(housekeeping_mask))
+ cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+ } else {
+ cpumask_var_t tmp;
+
+ alloc_bootmem_cpumask_var(&tmp);
+ cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
+ if (!cpumask_equal(tmp, housekeeping_mask)) {
+ pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+ free_bootmem_cpumask_var(tmp);
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+ return 0;
+ }
+ free_bootmem_cpumask_var(tmp);
+ }
+
+ if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
+ if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ tick_nohz_full_setup(non_housekeeping_mask);
+ } else {
+ pr_warn("Housekeeping: nohz unsupported."
+ " Build with CONFIG_NO_HZ_FULL\n");
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+ return 0;
+ }
+ }
+
+ housekeeping_flags |= flags;
+
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+
+ return 1;
+}
+
+static int __init housekeeping_nohz_full_setup(char *str)
+{
+ unsigned int flags;
+
+ flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+
+ return housekeeping_setup(str, flags);
+}
+__setup("nohz_full=", housekeeping_nohz_full_setup);
+
+static int __init housekeeping_isolcpus_setup(char *str)
+{
+ unsigned int flags = 0;
+
+ while (isalpha(*str)) {
+ if (!strncmp(str, "nohz,", 5)) {
+ str += 5;
+ flags |= HK_FLAG_TICK;
+ continue;
+ }
+
+ if (!strncmp(str, "domain,", 7)) {
+ str += 7;
+ flags |= HK_FLAG_DOMAIN;
+ continue;
+ }
+
+ pr_warn("isolcpus: Error, unknown flag\n");
+ return 0;
+ }
+
+ /* Default behaviour for isolcpus without flags */
+ if (!flags)
+ flags |= HK_FLAG_DOMAIN;
+
+ return housekeeping_setup(str, flags);
+}
+__setup("isolcpus=", housekeeping_isolcpus_setup);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3c96c80e0992..d8c43d73e078 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -74,10 +74,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -97,13 +93,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
- rt_rq->push_flags = 0;
- rt_rq->push_cpu = nr_cpu_ids;
- raw_spin_lock_init(&rt_rq->push_lock);
- init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -1876,241 +1865,166 @@ static void push_rt_tasks(struct rq *rq)
}
#ifdef HAVE_RT_PUSH_IPI
+
/*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if its the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tassk must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rt_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * the rt_loop_next will cause the iterator to perform another scan.
*
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
*/
static int rto_next_cpu(struct rq *rq)
{
- int prev_cpu = rq->rt.push_cpu;
+ struct root_domain *rd = rq->rd;
+ int next;
int cpu;
- cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
/*
- * If the previous cpu is less than the rq's CPU, then it already
- * passed the end of the mask, and has started from the beginning.
- * We end if the next CPU is greater or equal to rq's CPU.
+ * When starting the IPI RT pushing, the rto_cpu is set to -1,
+ * rt_next_cpu() will simply return the first CPU found in
+ * the rto_mask.
+ *
+ * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+ * will return the next CPU found in the rto_mask.
+ *
+ * If there are no more CPUs left in the rto_mask, then a check is made
+ * against rto_loop and rto_loop_next. rto_loop is only updated with
+ * the rto_lock held, but any CPU may increment the rto_loop_next
+ * without any locking.
*/
- if (prev_cpu < rq->cpu) {
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
+ for (;;) {
- } else if (cpu >= nr_cpu_ids) {
- /*
- * We passed the end of the mask, start at the beginning.
- * If the result is greater or equal to the rq's CPU, then
- * the loop is finished.
- */
- cpu = cpumask_first(rq->rd->rto_mask);
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
- }
- rq->rt.push_cpu = cpu;
+ /* When rto_cpu is -1 this acts like cpumask_first() */
+ cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
- /* Return cpu to let the caller know if the loop is finished or not */
- return cpu;
-}
+ rd->rto_cpu = cpu;
-static int find_next_push_cpu(struct rq *rq)
-{
- struct rq *next_rq;
- int cpu;
+ if (cpu < nr_cpu_ids)
+ return cpu;
- while (1) {
- cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- break;
- next_rq = cpu_rq(cpu);
+ rd->rto_cpu = -1;
+
+ /*
+ * ACQUIRE ensures we see the @rto_mask changes
+ * made prior to the @next value observed.
+ *
+ * Matches WMB in rt_set_overload().
+ */
+ next = atomic_read_acquire(&rd->rto_loop_next);
- /* Make sure the next rq can push to this rq */
- if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ if (rd->rto_loop == next)
break;
+
+ rd->rto_loop = next;
}
- return cpu;
+ return -1;
}
-#define RT_PUSH_IPI_EXECUTING 1
-#define RT_PUSH_IPI_RESTART 2
+static inline bool rto_start_trylock(atomic_t *v)
+{
+ return !atomic_cmpxchg_acquire(v, 0, 1);
+}
-/*
- * When a high priority task schedules out from a CPU and a lower priority
- * task is scheduled in, a check is made to see if there's any RT tasks
- * on other CPUs that are waiting to run because a higher priority RT task
- * is currently running on its CPU. In this case, the CPU with multiple RT
- * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- * up that may be able to run one of its non-running queued RT tasks.
- *
- * On large CPU boxes, there's the case that several CPUs could schedule
- * a lower priority task at the same time, in which case it will look for
- * any overloaded CPUs that it could pull a task from. To do this, the runqueue
- * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
- * for a single overloaded CPU's runqueue lock can produce a large latency.
- * (This has actually been observed on large boxes running cyclictest).
- * Instead of taking the runqueue lock of the overloaded CPU, each of the
- * CPUs that scheduled a lower priority task simply sends an IPI to the
- * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
- * lots of contention. The overloaded CPU will look to push its non-running
- * RT task off, and if it does, it can then ignore the other IPIs coming
- * in, and just pass those IPIs off to any other overloaded CPU.
- *
- * When a CPU schedules a lower priority task, it only sends an IPI to
- * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
- * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
- * RT overloaded tasks, would cause 100 IPIs to go out at once.
- *
- * The overloaded RT CPU, when receiving an IPI, will try to push off its
- * overloaded RT tasks and then send an IPI to the next CPU that has
- * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
- * have completed. Just because a CPU may have pushed off its own overloaded
- * RT task does not mean it should stop sending the IPI around to other
- * overloaded CPUs. There may be another RT task waiting to run on one of
- * those CPUs that are of higher priority than the one that was just
- * pushed.
- *
- * An optimization that could possibly be made is to make a CPU array similar
- * to the cpupri array mask of all running RT tasks, but for the overloaded
- * case, then the IPI could be sent to only the CPU with the highest priority
- * RT task waiting, and that CPU could send off further IPIs to the CPU with
- * the next highest waiting task. Since the overloaded case is much less likely
- * to happen, the complexity of this implementation may not be worth it.
- * Instead, just send an IPI around to all overloaded CPUs.
- *
- * The rq->rt.push_flags holds the status of the IPI that is going around.
- * A run queue can only send out a single IPI at a time. The possible flags
- * for rq->rt.push_flags are:
- *
- * (None or zero): No IPI is going around for the current rq
- * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
- * RT_PUSH_IPI_RESTART: The priority of the running task for the rq
- * has changed, and the IPI should restart
- * circulating the overloaded CPUs again.
- *
- * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
- * before sending to the next CPU.
- *
- * Instead of having all CPUs that schedule a lower priority task send
- * an IPI to the same "first" CPU in the RT overload mask, they send it
- * to the next overloaded CPU after their own CPU. This helps distribute
- * the work when there's more than one overloaded CPU and multiple CPUs
- * scheduling in lower priority tasks.
- *
- * When a rq schedules a lower priority task than what was currently
- * running, the next CPU with overloaded RT tasks is examined first.
- * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
- * priority task, it will send an IPI first to CPU 5, then CPU 5 will
- * send to CPU 1 if it is still overloaded. CPU 1 will clear the
- * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
- *
- * The first CPU to notice IPI_RESTART is set, will clear that flag and then
- * send an IPI to the next overloaded CPU after the rq->cpu and not the next
- * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
- * schedules a lower priority task, and the IPI_RESTART gets set while the
- * handling is being done on CPU 5, it will clear the flag and send it back to
- * CPU 4 instead of CPU 1.
- *
- * Note, the above logic can be disabled by turning off the sched_feature
- * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
- * taken by the CPU requesting a pull and the waiting RT task will be pulled
- * by that CPU. This may be fine for machines with few CPUs.
- */
-static void tell_cpu_to_push(struct rq *rq)
+static inline void rto_start_unlock(atomic_t *v)
{
- int cpu;
+ atomic_set_release(v, 0);
+}
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- raw_spin_lock(&rq->rt.push_lock);
- /* Make sure it's still executing */
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- /*
- * Tell the IPI to restart the loop as things have
- * changed since it started.
- */
- rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
- raw_spin_unlock(&rq->rt.push_lock);
- return;
- }
- raw_spin_unlock(&rq->rt.push_lock);
- }
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu = -1;
- /* When here, there's no IPI going around */
+ /* Keep the loop going if the IPI is currently active */
+ atomic_inc(&rq->rd->rto_loop_next);
- rq->rt.push_cpu = rq->cpu;
- cpu = find_next_push_cpu(rq);
- if (cpu >= nr_cpu_ids)
+ /* Only one CPU can initiate a loop at a time */
+ if (!rto_start_trylock(&rq->rd->rto_loop_start))
return;
- rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+ raw_spin_lock(&rq->rd->rto_lock);
+
+ /*
+ * The rto_cpu is updated under the lock, if it has a valid cpu
+ * then the IPI is still running and will continue due to the
+ * update to loop_next, and nothing needs to be done here.
+ * Otherwise it is finishing up and an ipi needs to be sent.
+ */
+ if (rq->rd->rto_cpu < 0)
+ cpu = rto_next_cpu(rq);
- irq_work_queue_on(&rq->rt.push_work, cpu);
+ raw_spin_unlock(&rq->rd->rto_lock);
+
+ rto_start_unlock(&rq->rd->rto_loop_start);
+
+ if (cpu >= 0)
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
/* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
{
- struct rt_rq *rt_rq = arg;
- struct rq *rq, *src_rq;
- int this_cpu;
+ struct rq *rq;
int cpu;
- this_cpu = rt_rq->push_cpu;
+ rq = this_rq();
- /* Paranoid check */
- BUG_ON(this_cpu != smp_processor_id());
-
- rq = cpu_rq(this_cpu);
- src_rq = rq_of_rt_rq(rt_rq);
-
-again:
+ /*
+ * We do not need to grab the lock to check for has_pushable_tasks.
+ * When it gets updated, a check is made if a push is possible.
+ */
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
- push_rt_task(rq);
+ push_rt_tasks(rq);
raw_spin_unlock(&rq->lock);
}
- /* Pass the IPI to the next rt overloaded queue */
- raw_spin_lock(&rt_rq->push_lock);
- /*
- * If the source queue changed since the IPI went out,
- * we need to restart the search from that CPU again.
- */
- if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- rt_rq->push_cpu = src_rq->cpu;
- }
+ raw_spin_lock(&rq->rd->rto_lock);
- cpu = find_next_push_cpu(src_rq);
+ /* Pass the IPI to the next rt overloaded queue */
+ cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- raw_spin_unlock(&rt_rq->push_lock);
+ raw_spin_unlock(&rq->rd->rto_lock);
- if (cpu >= nr_cpu_ids)
+ if (cpu < 0)
return;
- /*
- * It is possible that a restart caused this CPU to be
- * chosen again. Don't bother with an IPI, just see if we
- * have more to push.
- */
- if (unlikely(cpu == rq->cpu))
- goto again;
-
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
- struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
- try_to_push_tasks(rt_rq);
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3b448ba82225..45ab0bf564e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,7 +227,7 @@ struct dl_bw {
static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
dl_b->total_bw -= tsk_bw;
__dl_update(dl_b, (s32)tsk_bw / cpus);
@@ -256,7 +256,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy,
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
-extern void __dl_clear_params(struct task_struct *p);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_task_can_attach(struct task_struct *p,
const struct cpumask *cs_cpus_allowed);
@@ -419,6 +418,7 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
+ unsigned long runnable_weight;
unsigned int nr_running, h_nr_running;
u64 exec_clock;
@@ -444,18 +444,22 @@ struct cfs_rq {
* CFS load tracking
*/
struct sched_avg avg;
- u64 runnable_load_sum;
- unsigned long runnable_load_avg;
-#ifdef CONFIG_FAIR_GROUP_SCHED
- unsigned long tg_load_avg_contrib;
- unsigned long propagate_avg;
-#endif
- atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
+ struct {
+ raw_spinlock_t lock ____cacheline_aligned;
+ int nr;
+ unsigned long load_avg;
+ unsigned long util_avg;
+ unsigned long runnable_sum;
+ } removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ unsigned long tg_load_avg_contrib;
+ long propagate;
+ long prop_runnable_sum;
+
/*
* h_load = weight * f(tg)
*
@@ -502,7 +506,7 @@ static inline int rt_bandwidth_enabled(void)
}
/* RT IPI pull logic requires IRQ_WORK */
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif
@@ -524,12 +528,6 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
-#ifdef HAVE_RT_PUSH_IPI
- int push_flags;
- int push_cpu;
- struct irq_work push_work;
- raw_spinlock_t push_lock;
-#endif
#endif /* CONFIG_SMP */
int rt_queued;
@@ -638,6 +636,19 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
+#ifdef HAVE_RT_PUSH_IPI
+ /*
+ * For IPI pull requests, loop across the rto_mask.
+ */
+ struct irq_work rto_push_work;
+ raw_spinlock_t rto_lock;
+ /* These are only updated and read within rto_lock */
+ int rto_loop;
+ int rto_cpu;
+ /* These atomics are updated outside of a lock */
+ atomic_t rto_loop_next;
+ atomic_t rto_loop_start;
+#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@@ -655,6 +666,9 @@ extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
+#ifdef HAVE_RT_PUSH_IPI
+extern void rto_push_irq_work_func(struct irq_work *work);
+#endif
#endif /* CONFIG_SMP */
/*
@@ -1219,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
# define const_debug const
#endif
-extern const_debug unsigned int sysctl_sched_features;
-
#define SCHED_FEAT(name, enabled) \
__SCHED_FEAT_##name ,
@@ -1232,6 +1244,13 @@ enum {
#undef SCHED_FEAT
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+
+/*
+ * To support run-time toggling of sched features, all the translation units
+ * (but core.c) reference the sysctl_sched_features defined in core.c.
+ */
+extern const_debug unsigned int sysctl_sched_features;
+
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
@@ -1239,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
}
#include "features.h"
-
#undef SCHED_FEAT
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
+
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+
+/*
+ * Each translation unit has its own copy of sysctl_sched_features to allow
+ * constants propagation at compile time and compiler optimization based on
+ * features default.
+ */
+#define SCHED_FEAT(name, enabled) \
+ (1UL << __SCHED_FEAT_##name) * enabled |
+static const_debug __maybe_unused unsigned int sysctl_sched_features =
+#include "features.h"
+ 0;
+#undef SCHED_FEAT
+
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
@@ -1530,6 +1563,8 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
+extern void reweight_task(struct task_struct *p, int prio);
+
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6798276d29af..034cbed7f88b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/mutex.h>
+#include <linux/sched/isolation.h>
#include "sched.h"
@@ -269,6 +270,12 @@ static int init_rootdomain(struct root_domain *rd)
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
+#ifdef HAVE_RT_PUSH_IPI
+ rd->rto_cpu = -1;
+ raw_spin_lock_init(&rd->rto_lock);
+ init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+#endif
+
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
@@ -464,21 +471,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
update_top_cache_domain(cpu);
}
-/* Setup the mask of CPUs configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
- int ret;
-
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
- ret = cpulist_parse(str, cpu_isolated_map);
- if (ret) {
- pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
- return 0;
- }
- return 1;
-}
-__setup("isolcpus=", isolated_cpu_setup);
-
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@@ -1158,6 +1150,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
@@ -1332,6 +1325,10 @@ void sched_init_numa(void)
if (!sched_domains_numa_distance)
return;
+ /* Includes NUMA identity node at level 0. */
+ sched_domains_numa_distance[level++] = curr_distance;
+ sched_domains_numa_levels = level;
+
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
@@ -1379,8 +1376,7 @@ void sched_init_numa(void)
return;
/*
- * 'level' contains the number of unique distances, excluding the
- * identity distance node_distance(i,i).
+ * 'level' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
@@ -1442,9 +1438,18 @@ void sched_init_numa(void)
tl[i] = sched_domain_topology[i];
/*
+ * Add the NUMA identity distance, aka single NODE.
+ */
+ tl[i++] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .numa_level = 0,
+ SD_INIT_NAME(NODE)
+ };
+
+ /*
* .. and append 'j' levels of NUMA goodness.
*/
- for (j = 0; j < level; i++, j++) {
+ for (j = 1; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
@@ -1774,7 +1779,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
- cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+ cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
@@ -1857,7 +1862,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
doms_new = alloc_sched_domains(1);
if (doms_new) {
n = 1;
- cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ cpumask_and(doms_new[0], cpu_active_mask,
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
}
} else {
n = ndoms_new;
@@ -1880,7 +1886,8 @@ match1:
if (!doms_new) {
n = 0;
doms_new = &fallback_doms;
- cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ cpumask_and(doms_new[0], cpu_active_mask,
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
}
/* Build new domains: */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 418a1c045933..5f0dfb2abb8d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -190,7 +190,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
u32 ret = SECCOMP_RET_ALLOW;
/* Make sure cross-thread synced filter points somewhere sane. */
struct seccomp_filter *f =
- lockless_dereference(current->seccomp.filter);
+ READ_ONCE(current->seccomp.filter);
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
diff --git a/kernel/smp.c b/kernel/smp.c
index c94dd85c8d41..084c8b3a2681 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -213,7 +213,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
call_single_data_t *csd, *csd_next;
static bool warned;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
head = this_cpu_ptr(&call_single_queue);
entry = llist_del_all(head);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4e09821f9d9e..662f7b1b7a78 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -137,7 +137,7 @@ EXPORT_SYMBOL(__local_bh_disable_ip);
static void __local_bh_enable(unsigned int cnt)
{
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
if (softirq_count() == (cnt & SOFTIRQ_MASK))
trace_softirqs_on(_RET_IP_);
@@ -158,7 +158,8 @@ EXPORT_SYMBOL(_local_bh_enable);
void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
{
- WARN_ON_ONCE(in_irq() || irqs_disabled());
+ WARN_ON_ONCE(in_irq());
+ lockdep_assert_irqs_enabled();
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_disable();
#endif
@@ -396,9 +397,8 @@ void irq_exit(void)
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
#else
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
#endif
-
account_irq_exit_time(current);
preempt_count_sub(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
@@ -488,7 +488,7 @@ EXPORT_SYMBOL(__tasklet_hi_schedule);
void __tasklet_hi_schedule_first(struct tasklet_struct *t)
{
- BUG_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
t->next = __this_cpu_read(tasklet_hi_vec.head);
__this_cpu_write(tasklet_hi_vec.head, t);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5718b3ea202a..0fef395662a6 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -68,7 +68,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- while ((work = lockless_dereference(*pprev))) {
+ while ((work = READ_ONCE(*pprev))) {
if (work->func != func)
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 0dbab6d1acb4..dd53e354f630 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -22,7 +22,7 @@
#define div_factor 3
-static u32 rand1, preh_val, posth_val, jph_val;
+static u32 rand1, preh_val, posth_val;
static int errors, handler_errors, num_tests;
static u32 (*target)(u32 value);
static u32 (*target2)(u32 value);
@@ -34,6 +34,10 @@ static noinline u32 kprobe_target(u32 value)
static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("pre-handler is preemptible\n");
+ }
preh_val = (rand1 / div_factor);
return 0;
}
@@ -41,6 +45,10 @@ static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("post-handler is preemptible\n");
+ }
if (preh_val != (rand1 / div_factor)) {
handler_errors++;
pr_err("incorrect value in post_handler\n");
@@ -154,8 +162,15 @@ static int test_kprobes(void)
}
+#if 0
+static u32 jph_val;
+
static u32 j_kprobe_target(u32 value)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("jprobe-handler is preemptible\n");
+ }
if (value != rand1) {
handler_errors++;
pr_err("incorrect value in jprobe handler\n");
@@ -227,11 +242,19 @@ static int test_jprobes(void)
return 0;
}
+#else
+#define test_jprobe() (0)
+#define test_jprobes() (0)
+#endif
#ifdef CONFIG_KRETPROBES
static u32 krph_val;
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("kretprobe entry handler is preemptible\n");
+ }
krph_val = (rand1 / div_factor);
return 0;
}
@@ -240,6 +263,10 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long ret = regs_return_value(regs);
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("kretprobe return handler is preemptible\n");
+ }
if (ret != (rand1 / div_factor)) {
handler_errors++;
pr_err("incorrect value in kretprobe handler\n");
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index ac09bc29eb08..d689a9557e17 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -56,7 +56,7 @@ menu "Timers subsystem"
# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
# only related to the tick functionality. Oneshot clockevent devices
-# are supported independ of this.
+# are supported independent of this.
config TICK_ONESHOT
bool
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 4237e0744e26..16c027e9cc73 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -280,17 +280,22 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
static int clockevents_program_min_delta(struct clock_event_device *dev)
{
unsigned long long clc;
- int64_t delta;
+ int64_t delta = 0;
+ int i;
- delta = dev->min_delta_ns;
- dev->next_event = ktime_add_ns(ktime_get(), delta);
+ for (i = 0; i < 10; i++) {
+ delta += dev->min_delta_ns;
+ dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (clockevent_state_shutdown(dev))
- return 0;
+ if (clockevent_state_shutdown(dev))
+ return 0;
- dev->retries++;
- clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
- return dev->set_next_event((unsigned long) clc, dev);
+ dev->retries++;
+ clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ if (dev->set_next_event((unsigned long) clc, dev) == 0)
+ return 0;
+ }
+ return -ETIME;
}
#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 88f75f92ef36..d32520840fde 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -758,9 +758,7 @@ void clock_was_set(void)
*/
void hrtimers_resume(void)
{
- WARN_ONCE(!irqs_disabled(),
- KERN_INFO "hrtimers_resume() called with IRQs enabled!");
-
+ lockdep_assert_irqs_disabled();
/* Retrigger on the local CPU */
retrigger_next_event(NULL);
/* And schedule a retrigger for all others */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 99e03bec68e4..8d70da1b9a0d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -493,6 +493,67 @@ out:
return leap;
}
+static void sync_hw_clock(struct work_struct *work);
+static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock);
+
+static void sched_sync_hw_clock(struct timespec64 now,
+ unsigned long target_nsec, bool fail)
+
+{
+ struct timespec64 next;
+
+ getnstimeofday64(&next);
+ if (!fail)
+ next.tv_sec = 659;
+ else {
+ /*
+ * Try again as soon as possible. Delaying long periods
+ * decreases the accuracy of the work queue timer. Due to this
+ * the algorithm is very likely to require a short-sleep retry
+ * after the above long sleep to synchronize ts_nsec.
+ */
+ next.tv_sec = 0;
+ }
+
+ /* Compute the needed delay that will get to tv_nsec == target_nsec */
+ next.tv_nsec = target_nsec - next.tv_nsec;
+ if (next.tv_nsec <= 0)
+ next.tv_nsec += NSEC_PER_SEC;
+ if (next.tv_nsec >= NSEC_PER_SEC) {
+ next.tv_sec++;
+ next.tv_nsec -= NSEC_PER_SEC;
+ }
+
+ queue_delayed_work(system_power_efficient_wq, &sync_work,
+ timespec64_to_jiffies(&next));
+}
+
+static void sync_rtc_clock(void)
+{
+ unsigned long target_nsec;
+ struct timespec64 adjust, now;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_RTC_SYSTOHC))
+ return;
+
+ getnstimeofday64(&now);
+
+ adjust = now;
+ if (persistent_clock_is_local)
+ adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
+
+ /*
+ * The current RTC in use will provide the target_nsec it wants to be
+ * called at, and does rtc_tv_nsec_ok internally.
+ */
+ rc = rtc_set_ntp_time(adjust, &target_nsec);
+ if (rc == -ENODEV)
+ return;
+
+ sched_sync_hw_clock(now, target_nsec, rc);
+}
+
#ifdef CONFIG_GENERIC_CMOS_UPDATE
int __weak update_persistent_clock(struct timespec now)
{
@@ -508,76 +569,75 @@ int __weak update_persistent_clock64(struct timespec64 now64)
}
#endif
-#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
-static void sync_cmos_clock(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
-
-static void sync_cmos_clock(struct work_struct *work)
+static bool sync_cmos_clock(void)
{
+ static bool no_cmos;
struct timespec64 now;
- struct timespec64 next;
- int fail = 1;
+ struct timespec64 adjust;
+ int rc = -EPROTO;
+ long target_nsec = NSEC_PER_SEC / 2;
+
+ if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE))
+ return false;
+
+ if (no_cmos)
+ return false;
/*
- * If we have an externally synchronized Linux clock, then update
- * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
- * called as close as possible to 500 ms before the new second starts.
- * This code is run on a timer. If the clock is set, that timer
- * may not expire at the correct time. Thus, we adjust...
- * We want the clock to be within a couple of ticks from the target.
+ * Historically update_persistent_clock64() has followed x86
+ * semantics, which match the MC146818A/etc RTC. This RTC will store
+ * 'adjust' and then in .5s it will advance once second.
+ *
+ * Architectures are strongly encouraged to use rtclib and not
+ * implement this legacy API.
*/
- if (!ntp_synced()) {
- /*
- * Not synced, exit, do not restart a timer (if one is
- * running, let it run out).
- */
- return;
- }
-
getnstimeofday64(&now);
- if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
- struct timespec64 adjust = now;
-
- fail = -ENODEV;
+ if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) {
if (persistent_clock_is_local)
adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
-#ifdef CONFIG_GENERIC_CMOS_UPDATE
- fail = update_persistent_clock64(adjust);
-#endif
-
-#ifdef CONFIG_RTC_SYSTOHC
- if (fail == -ENODEV)
- fail = rtc_set_ntp_time(adjust);
-#endif
+ rc = update_persistent_clock64(adjust);
+ /*
+ * The machine does not support update_persistent_clock64 even
+ * though it defines CONFIG_GENERIC_CMOS_UPDATE.
+ */
+ if (rc == -ENODEV) {
+ no_cmos = true;
+ return false;
+ }
}
- next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
- if (next.tv_nsec <= 0)
- next.tv_nsec += NSEC_PER_SEC;
+ sched_sync_hw_clock(now, target_nsec, rc);
+ return true;
+}
- if (!fail || fail == -ENODEV)
- next.tv_sec = 659;
- else
- next.tv_sec = 0;
+/*
+ * If we have an externally synchronized Linux clock, then update RTC clock
+ * accordingly every ~11 minutes. Generally RTCs can only store second
+ * precision, but many RTCs will adjust the phase of their second tick to
+ * match the moment of update. This infrastructure arranges to call to the RTC
+ * set at the correct moment to phase synchronize the RTC second tick over
+ * with the kernel clock.
+ */
+static void sync_hw_clock(struct work_struct *work)
+{
+ if (!ntp_synced())
+ return;
- if (next.tv_nsec >= NSEC_PER_SEC) {
- next.tv_sec++;
- next.tv_nsec -= NSEC_PER_SEC;
- }
- queue_delayed_work(system_power_efficient_wq,
- &sync_cmos_work, timespec64_to_jiffies(&next));
+ if (sync_cmos_clock())
+ return;
+
+ sync_rtc_clock();
}
void ntp_notify_cmos_timer(void)
{
- queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
-}
-
-#else
-void ntp_notify_cmos_timer(void) { }
-#endif
+ if (!ntp_synced())
+ return;
+ if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) ||
+ IS_ENABLED(CONFIG_RTC_SYSTOHC))
+ queue_delayed_work(system_power_efficient_wq, &sync_work, 0);
+}
/*
* Propagate a new txc->status value into the NTP state:
@@ -654,67 +714,6 @@ static inline void process_adjtimex_modes(struct timex *txc,
}
-
-/**
- * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
- */
-int ntp_validate_timex(struct timex *txc)
-{
- if (txc->modes & ADJ_ADJTIME) {
- /* singleshot must not be used with any other mode bits */
- if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
- return -EINVAL;
- if (!(txc->modes & ADJ_OFFSET_READONLY) &&
- !capable(CAP_SYS_TIME))
- return -EPERM;
- } else {
- /* In order to modify anything, you gotta be super-user! */
- if (txc->modes && !capable(CAP_SYS_TIME))
- return -EPERM;
- /*
- * if the quartz is off by more than 10% then
- * something is VERY wrong!
- */
- if (txc->modes & ADJ_TICK &&
- (txc->tick < 900000/USER_HZ ||
- txc->tick > 1100000/USER_HZ))
- return -EINVAL;
- }
-
- if (txc->modes & ADJ_SETOFFSET) {
- /* In order to inject time, you gotta be super-user! */
- if (!capable(CAP_SYS_TIME))
- return -EPERM;
-
- if (txc->modes & ADJ_NANO) {
- struct timespec ts;
-
- ts.tv_sec = txc->time.tv_sec;
- ts.tv_nsec = txc->time.tv_usec;
- if (!timespec_inject_offset_valid(&ts))
- return -EINVAL;
-
- } else {
- if (!timeval_inject_offset_valid(&txc->time))
- return -EINVAL;
- }
- }
-
- /*
- * Check for potential multiplication overflows that can
- * only happen on 64-bit systems:
- */
- if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
- if (LLONG_MIN / PPM_SCALE > txc->freq)
- return -EINVAL;
- if (LLONG_MAX / PPM_SCALE < txc->freq)
- return -EINVAL;
- }
-
- return 0;
-}
-
-
/*
* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 0a53e6ea47b1..909bd1f1bfb1 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -8,7 +8,6 @@ extern void ntp_clear(void);
extern u64 ntp_tick_length(void);
extern ktime_t ntp_get_next_leap(void);
extern int second_overflow(time64_t secs);
-extern int ntp_validate_timex(struct timex *);
extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
extern void __hardpps(const struct timespec64 *, const struct timespec64 *);
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 5b117110b55b..1f27887aa194 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -603,7 +603,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
/*
* Disarm any old timer after extracting its expiry time.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
ret = 0;
old_incr = timer->it.cpu.incr;
@@ -1034,7 +1034,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
/*
* Now re-arm for the new expiry time.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
arm_timer(timer);
unlock:
unlock_task_sighand(p, &flags);
@@ -1125,7 +1125,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
struct k_itimer *timer, *next;
unsigned long flags;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
/*
* The fast path checks that there are no expired thread or thread
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 06f34feb635e..b258bee13b02 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -117,8 +117,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
const struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
- struct timespec64 t64;
- struct timespec t;
+ struct timespec64 t;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -129,16 +128,15 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
}
- if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+ if (get_timespec64(&t, rqtp))
return -EFAULT;
- t64 = timespec_to_timespec64(t);
- if (!timespec64_valid(&t64))
+ if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
+ return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
@@ -203,8 +201,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
{
- struct timespec64 t64;
- struct timespec t;
+ struct timespec64 t;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -215,16 +212,15 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
return -EINVAL;
}
- if (compat_get_timespec(&t, rqtp))
+ if (compat_get_timespec64(&t, rqtp))
return -EFAULT;
- t64 = timespec_to_timespec64(t);
- if (!timespec64_valid(&t64))
+ if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
+ return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 6b009c207671..c1f518e7aa80 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -33,6 +33,7 @@ int tick_program_event(ktime_t expires, int force)
* We don't need the clock event device any more, stop it.
*/
clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+ dev->next_event = KTIME_MAX;
return 0;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c7a899c5ce64..99578f06c8d4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -27,6 +27,7 @@
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
+#include <linux/mm.h>
#include <asm/irq_regs.h>
@@ -165,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
-cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
static atomic_t tick_dep_mask;
@@ -198,7 +198,7 @@ static bool check_tick_dependency(atomic_t *dep)
static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
if (unlikely(!cpu_online(cpu)))
return false;
@@ -385,20 +385,13 @@ out:
local_irq_restore(flags);
}
-/* Parse the boot-time nohz CPU list from the kernel parameters. */
-static int __init tick_nohz_full_setup(char *str)
+/* Get the boot-time nohz CPU list from the kernel parameters. */
+void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
- if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
- pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
- free_bootmem_cpumask_var(tick_nohz_full_mask);
- return 1;
- }
+ cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
-
- return 1;
}
-__setup("nohz_full=", tick_nohz_full_setup);
static int tick_nohz_cpu_down(unsigned int cpu)
{
@@ -437,13 +430,6 @@ void __init tick_nohz_init(void)
return;
}
- if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
- WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
- cpumask_clear(tick_nohz_full_mask);
- tick_nohz_full_running = false;
- return;
- }
-
/*
* Full dynticks uses irq work to drive the tick rescheduling on safe
* locking contexts. But then we need irq work to raise its own
@@ -452,7 +438,6 @@ void __init tick_nohz_init(void)
if (!arch_irq_work_has_interrupt()) {
pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
- cpumask_copy(housekeeping_mask, cpu_possible_mask);
tick_nohz_full_running = false;
return;
}
@@ -465,9 +450,6 @@ void __init tick_nohz_init(void)
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
- cpumask_andnot(housekeeping_mask,
- cpu_possible_mask, tick_nohz_full_mask);
-
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
@@ -477,12 +459,6 @@ void __init tick_nohz_init(void)
WARN_ON(ret < 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
-
- /*
- * We need at least one CPU to handle housekeeping work such
- * as timekeeping, unbound timers, workqueues, ...
- */
- WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
#endif
@@ -787,6 +763,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
if (!ts->tick_stopped) {
calc_load_nohz_start();
cpu_load_update_nohz_start();
+ quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
@@ -960,8 +937,7 @@ void tick_nohz_idle_enter(void)
{
struct tick_sched *ts;
- WARN_ON_ONCE(irqs_disabled());
-
+ lockdep_assert_irqs_enabled();
/*
* Update the idle state in the scheduler domain hierarchy
* when tick_nohz_stop_sched_tick() is called from the idle loop.
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 44a8c1402133..bd4e6c7dd689 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -82,7 +82,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc)
SYSCALL_DEFINE1(stime, time_t __user *, tptr)
{
- struct timespec tv;
+ struct timespec64 tv;
int err;
if (get_user(tv.tv_sec, tptr))
@@ -90,11 +90,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr)
tv.tv_nsec = 0;
- err = security_settime(&tv, NULL);
+ err = security_settime64(&tv, NULL);
if (err)
return err;
- do_settimeofday(&tv);
+ do_settimeofday64(&tv);
return 0;
}
@@ -122,7 +122,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
{
- struct timespec tv;
+ struct timespec64 tv;
int err;
if (get_user(tv.tv_sec, tptr))
@@ -130,11 +130,11 @@ COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
tv.tv_nsec = 0;
- err = security_settime(&tv, NULL);
+ err = security_settime64(&tv, NULL);
if (err)
return err;
- do_settimeofday(&tv);
+ do_settimeofday64(&tv);
return 0;
}
@@ -158,40 +158,6 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
}
/*
- * Indicates if there is an offset between the system clock and the hardware
- * clock/persistent clock/rtc.
- */
-int persistent_clock_is_local;
-
-/*
- * Adjust the time obtained from the CMOS to be UTC time instead of
- * local time.
- *
- * This is ugly, but preferable to the alternatives. Otherwise we
- * would either need to write a program to do it in /etc/rc (and risk
- * confusion if the program gets run more than once; it would also be
- * hard to make the program warp the clock precisely n hours) or
- * compile in the timezone information into the kernel. Bad, bad....
- *
- * - TYT, 1992-01-01
- *
- * The best thing to do is to keep the CMOS clock in universal time (UTC)
- * as real UNIX machines always do it. This avoids all headaches about
- * daylight saving times and warping kernel clocks.
- */
-static inline void warp_clock(void)
-{
- if (sys_tz.tz_minuteswest != 0) {
- struct timespec adjust;
-
- persistent_clock_is_local = 1;
- adjust.tv_sec = sys_tz.tz_minuteswest * 60;
- adjust.tv_nsec = 0;
- timekeeping_inject_offset(&adjust);
- }
-}
-
-/*
* In case for some reason the CMOS clock has not already been running
* in UTC, but in some local time: The first time we set the timezone,
* we will warp the clock so that it is ticking UTC time instead of
@@ -224,7 +190,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
if (firsttime) {
firsttime = 0;
if (!tv)
- warp_clock();
+ timekeeping_warp_clock();
}
}
if (tv)
@@ -441,6 +407,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
}
EXPORT_SYMBOL(mktime64);
+#if __BITS_PER_LONG == 32
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
*
@@ -501,6 +468,7 @@ struct timespec ns_to_timespec(const s64 nsec)
return ts;
}
EXPORT_SYMBOL(ns_to_timespec);
+#endif
/**
* ns_to_timeval - Convert nanoseconds to timeval
@@ -520,7 +488,6 @@ struct timeval ns_to_timeval(const s64 nsec)
}
EXPORT_SYMBOL(ns_to_timeval);
-#if BITS_PER_LONG == 32
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
*
@@ -581,7 +548,7 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
return ts;
}
EXPORT_SYMBOL(ns_to_timespec64);
-#endif
+
/**
* msecs_to_jiffies: - convert milliseconds to jiffies
* @m: time in milliseconds
@@ -853,24 +820,6 @@ unsigned long nsecs_to_jiffies(u64 n)
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
/*
- * Add two timespec values and do a safety check for overflow.
- * It's assumed that both values are valid (>= 0)
- */
-struct timespec timespec_add_safe(const struct timespec lhs,
- const struct timespec rhs)
-{
- struct timespec res;
-
- set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
- lhs.tv_nsec + rhs.tv_nsec);
-
- if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
- res.tv_sec = TIME_T_MAX;
-
- return res;
-}
-
-/*
* Add two timespec64 values and do a safety check for overflow.
* It's assumed that both values are valid (>= 0).
* And, each timespec64 is in normalized form.
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2cafb49aa65e..198afa78bf69 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -60,8 +60,27 @@ struct tk_fast {
struct tk_read_base base[2];
};
-static struct tk_fast tk_fast_mono ____cacheline_aligned;
-static struct tk_fast tk_fast_raw ____cacheline_aligned;
+/* Suspend-time cycles value for halted fast timekeeper. */
+static u64 cycles_at_suspend;
+
+static u64 dummy_clock_read(struct clocksource *cs)
+{
+ return cycles_at_suspend;
+}
+
+static struct clocksource dummy_clock = {
+ .read = dummy_clock_read,
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned = {
+ .base[0] = { .clock = &dummy_clock, },
+ .base[1] = { .clock = &dummy_clock, },
+};
+
+static struct tk_fast tk_fast_raw ____cacheline_aligned = {
+ .base[0] = { .clock = &dummy_clock, },
+ .base[1] = { .clock = &dummy_clock, },
+};
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -477,17 +496,39 @@ u64 notrace ktime_get_boot_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
-/* Suspend-time cycles value for halted fast timekeeper. */
-static u64 cycles_at_suspend;
-static u64 dummy_clock_read(struct clocksource *cs)
+/*
+ * See comment for __ktime_get_fast_ns() vs. timestamp ordering
+ */
+static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf)
{
- return cycles_at_suspend;
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ u64 now;
+
+ do {
+ seq = raw_read_seqcount_latch(&tkf->seq);
+ tkr = tkf->base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base_real);
+
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tk_clock_read(tkr),
+ tkr->cycle_last,
+ tkr->mask));
+ } while (read_seqcount_retry(&tkf->seq, seq));
+
+ return now;
}
-static struct clocksource dummy_clock = {
- .read = dummy_clock_read,
-};
+/**
+ * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
+ */
+u64 ktime_get_real_fast_ns(void)
+{
+ return __ktime_get_real_fast_ns(&tk_fast_mono);
+}
+EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
@@ -507,6 +548,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
cycles_at_suspend = tk_clock_read(tkr);
tkr_dummy.clock = &dummy_clock;
+ tkr_dummy.base_real = tkr->base + tk->offs_real;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
@@ -654,6 +696,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
@@ -1264,33 +1307,31 @@ EXPORT_SYMBOL(do_settimeofday64);
*
* Adds or subtracts an offset value from the current time.
*/
-int timekeeping_inject_offset(struct timespec *ts)
+static int timekeeping_inject_offset(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- struct timespec64 ts64, tmp;
+ struct timespec64 tmp;
int ret = 0;
- if (!timespec_inject_offset_valid(ts))
+ if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- ts64 = timespec_to_timespec64(*ts);
-
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
/* Make sure the proposed value is valid */
- tmp = timespec64_add(tk_xtime(tk), ts64);
- if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 ||
+ tmp = timespec64_add(tk_xtime(tk), *ts);
+ if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
!timespec64_valid_strict(&tmp)) {
ret = -EINVAL;
goto error;
}
- tk_xtime_add(tk, &ts64);
- tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));
+ tk_xtime_add(tk, ts);
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts));
error: /* even if we error out, we forwarded the time, so call update */
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1303,7 +1344,40 @@ error: /* even if we error out, we forwarded the time, so call update */
return ret;
}
-EXPORT_SYMBOL(timekeeping_inject_offset);
+
+/*
+ * Indicates if there is an offset between the system clock and the hardware
+ * clock/persistent clock/rtc.
+ */
+int persistent_clock_is_local;
+
+/*
+ * Adjust the time obtained from the CMOS to be UTC time instead of
+ * local time.
+ *
+ * This is ugly, but preferable to the alternatives. Otherwise we
+ * would either need to write a program to do it in /etc/rc (and risk
+ * confusion if the program gets run more than once; it would also be
+ * hard to make the program warp the clock precisely n hours) or
+ * compile in the timezone information into the kernel. Bad, bad....
+ *
+ * - TYT, 1992-01-01
+ *
+ * The best thing to do is to keep the CMOS clock in universal time (UTC)
+ * as real UNIX machines always do it. This avoids all headaches about
+ * daylight saving times and warping kernel clocks.
+ */
+void timekeeping_warp_clock(void)
+{
+ if (sys_tz.tz_minuteswest != 0) {
+ struct timespec64 adjust;
+
+ persistent_clock_is_local = 1;
+ adjust.tv_sec = sys_tz.tz_minuteswest * 60;
+ adjust.tv_nsec = 0;
+ timekeeping_inject_offset(&adjust);
+ }
+}
/**
* __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
@@ -2248,6 +2322,72 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
}
/**
+ * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
+ */
+static int timekeeping_validate_timex(struct timex *txc)
+{
+ if (txc->modes & ADJ_ADJTIME) {
+ /* singleshot must not be used with any other mode bits */
+ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
+ return -EINVAL;
+ if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+ !capable(CAP_SYS_TIME))
+ return -EPERM;
+ } else {
+ /* In order to modify anything, you gotta be super-user! */
+ if (txc->modes && !capable(CAP_SYS_TIME))
+ return -EPERM;
+ /*
+ * if the quartz is off by more than 10% then
+ * something is VERY wrong!
+ */
+ if (txc->modes & ADJ_TICK &&
+ (txc->tick < 900000/USER_HZ ||
+ txc->tick > 1100000/USER_HZ))
+ return -EINVAL;
+ }
+
+ if (txc->modes & ADJ_SETOFFSET) {
+ /* In order to inject time, you gotta be super-user! */
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ /*
+ * Validate if a timespec/timeval used to inject a time
+ * offset is valid. Offsets can be postive or negative, so
+ * we don't check tv_sec. The value of the timeval/timespec
+ * is the sum of its fields,but *NOTE*:
+ * The field tv_usec/tv_nsec must always be non-negative and
+ * we can't have more nanoseconds/microseconds than a second.
+ */
+ if (txc->time.tv_usec < 0)
+ return -EINVAL;
+
+ if (txc->modes & ADJ_NANO) {
+ if (txc->time.tv_usec >= NSEC_PER_SEC)
+ return -EINVAL;
+ } else {
+ if (txc->time.tv_usec >= USEC_PER_SEC)
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Check for potential multiplication overflows that can
+ * only happen on 64-bit systems:
+ */
+ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
+ if (LLONG_MIN / PPM_SCALE > txc->freq)
+ return -EINVAL;
+ if (LLONG_MAX / PPM_SCALE < txc->freq)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
*/
int do_adjtimex(struct timex *txc)
@@ -2259,12 +2399,12 @@ int do_adjtimex(struct timex *txc)
int ret;
/* Validate the data before disabling interrupts */
- ret = ntp_validate_timex(txc);
+ ret = timekeeping_validate_timex(txc);
if (ret)
return ret;
if (txc->modes & ADJ_SETOFFSET) {
- struct timespec delta;
+ struct timespec64 delta;
delta.tv_sec = txc->time.tv_sec;
delta.tv_nsec = txc->time.tv_usec;
if (!(txc->modes & ADJ_NANO))
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index c9f9af339914..7a9b4eb7a1d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -11,7 +11,7 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
extern int timekeeping_valid_for_hres(void);
extern u64 timekeeping_max_deferment(void);
-extern int timekeeping_inject_offset(struct timespec *ts);
+extern void timekeeping_warp_clock(void);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f2674a056c26..af0b8bae4502 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -610,7 +610,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state)
}
/* Stub timer callback for improperly used timers. */
-static void stub_timer(unsigned long data)
+static void stub_timer(struct timer_list *unused)
{
WARN_ON(1);
}
@@ -626,7 +626,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- setup_timer(timer, stub_timer, 0);
+ timer_setup(timer, stub_timer, 0);
return true;
case ODEBUG_STATE_ACTIVE:
@@ -665,7 +665,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- setup_timer(timer, stub_timer, 0);
+ timer_setup(timer, stub_timer, 0);
return true;
default:
return false;
@@ -929,8 +929,11 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
}
}
+#define MOD_TIMER_PENDING_ONLY 0x01
+#define MOD_TIMER_REDUCE 0x02
+
static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
+__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
struct timer_base *base, *new_base;
unsigned int idx = UINT_MAX;
@@ -950,7 +953,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* larger granularity than you would get from adding a new
* timer with this expiry.
*/
- if (timer->expires == expires)
+ long diff = timer->expires - expires;
+
+ if (!diff)
+ return 1;
+ if (options & MOD_TIMER_REDUCE && diff <= 0)
return 1;
/*
@@ -962,6 +969,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
base = lock_timer_base(timer, &flags);
forward_timer_base(base);
+ if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
+ time_before_eq(timer->expires, expires)) {
+ ret = 1;
+ goto out_unlock;
+ }
+
clk = base->clk;
idx = calc_wheel_index(expires, clk);
@@ -971,7 +984,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* subsequent call will exit in the expires check above.
*/
if (idx == timer_get_idx(timer)) {
- timer->expires = expires;
+ if (!(options & MOD_TIMER_REDUCE))
+ timer->expires = expires;
+ else if (time_after(timer->expires, expires))
+ timer->expires = expires;
ret = 1;
goto out_unlock;
}
@@ -981,7 +997,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
ret = detach_if_pending(timer, base, false);
- if (!ret && pending_only)
+ if (!ret && (options & MOD_TIMER_PENDING_ONLY))
goto out_unlock;
debug_activate(timer, expires);
@@ -1042,7 +1058,7 @@ out_unlock:
*/
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
- return __mod_timer(timer, expires, true);
+ return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);
@@ -1068,11 +1084,26 @@ EXPORT_SYMBOL(mod_timer_pending);
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
- return __mod_timer(timer, expires, false);
+ return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);
/**
+ * timer_reduce - Modify a timer's timeout if it would reduce the timeout
+ * @timer: The timer to be modified
+ * @expires: New timeout in jiffies
+ *
+ * timer_reduce() is very similar to mod_timer(), except that it will only
+ * modify a running timer if that would reduce the expiration time (it will
+ * start a timer that isn't running).
+ */
+int timer_reduce(struct timer_list *timer, unsigned long expires)
+{
+ return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
+}
+EXPORT_SYMBOL(timer_reduce);
+
+/**
* add_timer - start a timer
* @timer: the timer to be added
*
@@ -1560,8 +1591,11 @@ static int collect_expired_timers(struct timer_base *base,
* jiffies, otherwise forward to the next expiry time:
*/
if (time_after(next, jiffies)) {
- /* The call site will increment clock! */
- base->clk = jiffies - 1;
+ /*
+ * The call site will increment base->clk and then
+ * terminate the expiry loop immediately.
+ */
+ base->clk = jiffies;
return 0;
}
base->clk = next;
@@ -1668,9 +1702,20 @@ void run_local_timers(void)
raise_softirq(TIMER_SOFTIRQ);
}
-static void process_timeout(unsigned long __data)
+/*
+ * Since schedule_timeout()'s timer is defined on the stack, it must store
+ * the target task on the stack as well.
+ */
+struct process_timer {
+ struct timer_list timer;
+ struct task_struct *task;
+};
+
+static void process_timeout(struct timer_list *t)
{
- wake_up_process((struct task_struct *)__data);
+ struct process_timer *timeout = from_timer(timeout, t, timer);
+
+ wake_up_process(timeout->task);
}
/**
@@ -1704,7 +1749,7 @@ static void process_timeout(unsigned long __data)
*/
signed long __sched schedule_timeout(signed long timeout)
{
- struct timer_list timer;
+ struct process_timer timer;
unsigned long expire;
switch (timeout)
@@ -1738,13 +1783,14 @@ signed long __sched schedule_timeout(signed long timeout)
expire = timeout + jiffies;
- setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
- __mod_timer(&timer, expire, false);
+ timer.task = current;
+ timer_setup_on_stack(&timer.timer, process_timeout, 0);
+ __mod_timer(&timer.timer, expire, 0);
schedule();
- del_singleshot_timer_sync(&timer);
+ del_singleshot_timer_sync(&timer.timer);
/* Remove the timer from the object tracker */
- destroy_timer_on_stack(&timer);
+ destroy_timer_on_stack(&timer.timer);
timeout = expire - jiffies;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b605d5d..95888ae6c263 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- err = perf_event_read_local(ee->event, &value);
+ err = perf_event_read_local(ee->event, &value, NULL, NULL);
/*
* this api is ugly since we miss [-22..-2] range of valid
* counter values, but that's uapi
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 81279c6602ff..845f3805c73d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2724,7 +2724,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
* if it happened, we have to fail the write.
*/
barrier();
- if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+ if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) {
local_dec(&cpu_buffer->committing);
local_dec(&cpu_buffer->commits);
return NULL;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 401b0639116f..6b0b343a36a2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1460,7 +1460,7 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr,
static inline void *event_file_data(struct file *filp)
{
- return ACCESS_ONCE(file_inode(filp)->i_private);
+ return READ_ONCE(file_inode(filp)->i_private);
}
extern struct mutex event_mutex;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c738e764e2a5..90db994ac900 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- T = __task_state_to_char(field->next_state);
- S = __task_state_to_char(field->prev_state);
+ T = task_index_to_char(field->next_state);
+ S = task_index_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
" %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- S = __task_state_to_char(field->prev_state);
- T = __task_state_to_char(field->next_state);
+ S = task_index_to_char(field->prev_state);
+ T = task_index_to_char(field->next_state);
trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
field->prev_pid,
field->prev_prio,
@@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- S = __task_state_to_char(field->prev_state);
- T = __task_state_to_char(field->next_state);
+ S = task_index_to_char(field->prev_state);
+ T = task_index_to_char(field->next_state);
SEQ_PUT_HEX_FIELD(s, field->prev_pid);
SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7d461dcd4831..a86b303e6c67 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -398,10 +398,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = prev->pid;
entry->prev_prio = prev->prio;
- entry->prev_state = __get_task_state(prev);
+ entry->prev_state = task_state_index(prev);
entry->next_pid = next->pid;
entry->next_prio = next->prio;
- entry->next_state = __get_task_state(next);
+ entry->next_state = task_state_index(next);
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
@@ -426,10 +426,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = curr->pid;
entry->prev_prio = curr->prio;
- entry->prev_state = __get_task_state(curr);
+ entry->prev_state = task_state_index(curr);
entry->next_pid = wakee->pid;
entry->next_prio = wakee->prio;
- entry->next_state = __get_task_state(wakee);
+ entry->next_state = task_state_index(wakee);
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 719a52a4064a..734accc02418 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -78,7 +78,7 @@ check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
- int frame_size = ACCESS_ONCE(tracer_frame);
+ int frame_size = READ_ONCE(tracer_frame);
int i, x;
this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..d32b45662fb6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -894,7 +894,7 @@ static bool new_idmap_permitted(const struct file *file,
int proc_setgroups_show(struct seq_file *seq, void *v)
{
struct user_namespace *ns = seq->private;
- unsigned long userns_flags = ACCESS_ONCE(ns->flags);
+ unsigned long userns_flags = READ_ONCE(ns->flags);
seq_printf(seq, "%s\n",
(userns_flags & USERNS_SETGROUPS_ALLOWED) ?
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c8e06703e44c..576d18045811 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -25,6 +25,7 @@
#include <linux/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -774,15 +775,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
void __init lockup_detector_init(void)
{
-#ifdef CONFIG_NO_HZ_FULL
- if (tick_nohz_full_enabled()) {
+ if (tick_nohz_full_enabled())
pr_info("Disabling watchdog on nohz_full cores by default\n");
- cpumask_copy(&watchdog_cpumask, housekeeping_mask);
- } else
- cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
-#else
- cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
-#endif
+
+ cpumask_copy(&watchdog_cpumask,
+ housekeeping_cpumask(HK_FLAG_TIMER));
if (!watchdog_nmi_probe())
nmi_watchdog_available = true;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a2dccfe1acec..7368b57842ea 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1376,7 +1376,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
* queued or lose PENDING. Grabbing PENDING and queueing should
* happen with IRQ disabled.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
debug_work_activate(work);
@@ -1493,9 +1493,9 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL(queue_work_on);
-void delayed_work_timer_fn(unsigned long __data)
+void delayed_work_timer_fn(struct timer_list *t)
{
- struct delayed_work *dwork = (struct delayed_work *)__data;
+ struct delayed_work *dwork = from_timer(dwork, t, timer);
/* should have been called from irqsafe timer with irq already off */
__queue_work(dwork->cpu, dwork->wq, &dwork->work);
@@ -1509,8 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work = &dwork->work;
WARN_ON_ONCE(!wq);
- WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
- timer->data != (unsigned long)dwork);
+ WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn);
WARN_ON_ONCE(timer_pending(timer));
WARN_ON_ONCE(!list_empty(&work->entry));
@@ -1833,9 +1832,9 @@ static void destroy_worker(struct worker *worker)
wake_up_process(worker->task);
}
-static void idle_worker_timeout(unsigned long __pool)
+static void idle_worker_timeout(struct timer_list *t)
{
- struct worker_pool *pool = (void *)__pool;
+ struct worker_pool *pool = from_timer(pool, t, idle_timer);
spin_lock_irq(&pool->lock);
@@ -1881,9 +1880,9 @@ static void send_mayday(struct work_struct *work)
}
}
-static void pool_mayday_timeout(unsigned long __pool)
+static void pool_mayday_timeout(struct timer_list *t)
{
- struct worker_pool *pool = (void *)__pool;
+ struct worker_pool *pool = from_timer(pool, t, mayday_timer);
struct work_struct *work;
spin_lock_irq(&pool->lock);
@@ -2491,15 +2490,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
- /*
- * Explicitly init the crosslock for wq_barrier::done, make its lock
- * key a subkey of the corresponding work. As a result we won't
- * build a dependency between wq_barrier::done and unrelated work.
- */
- lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map,
- "(complete)wq_barr::done",
- target->lockdep_map.key, 1);
- __init_completion(&barr->done);
+ init_completion_map(&barr->done, &target->lockdep_map);
+
barr->task = current;
/*
@@ -2605,16 +2597,13 @@ void flush_workqueue(struct workqueue_struct *wq)
struct wq_flusher this_flusher = {
.list = LIST_HEAD_INIT(this_flusher.list),
.flush_color = -1,
- .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
+ .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
};
int next_color;
if (WARN_ON(!wq_online))
return;
- lock_map_acquire(&wq->lockdep_map);
- lock_map_release(&wq->lockdep_map);
-
mutex_lock(&wq->mutex);
/*
@@ -2877,9 +2866,6 @@ bool flush_work(struct work_struct *work)
if (WARN_ON(!wq_online))
return false;
- lock_map_acquire(&work->lockdep_map);
- lock_map_release(&work->lockdep_map);
-
if (start_flush_work(work, &barr)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
@@ -3236,11 +3222,9 @@ static int init_worker_pool(struct worker_pool *pool)
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
- setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout,
- (unsigned long)pool);
+ timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
- setup_timer(&pool->mayday_timer, pool_mayday_timeout,
- (unsigned long)pool);
+ timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
mutex_init(&pool->attach_mutex);
INIT_LIST_HEAD(&pool->workers);
@@ -4640,7 +4624,7 @@ static void rebind_workers(struct worker_pool *pool)
* concurrency management. Note that when or whether
* @worker clears REBOUND doesn't affect correctness.
*
- * ACCESS_ONCE() is necessary because @worker->flags may be
+ * WRITE_ONCE() is necessary because @worker->flags may be
* tested without holding any lock in
* wq_worker_waking_up(). Without it, NOT_RUNNING test may
* fail incorrectly leading to premature concurrency
@@ -4649,7 +4633,7 @@ static void rebind_workers(struct worker_pool *pool)
WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
worker_flags |= WORKER_REBOUND;
worker_flags &= ~WORKER_UNBOUND;
- ACCESS_ONCE(worker->flags) = worker_flags;
+ WRITE_ONCE(worker->flags, worker_flags);
}
spin_unlock_irq(&pool->lock);
@@ -5383,11 +5367,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
*/
#ifdef CONFIG_WQ_WATCHDOG
-static void wq_watchdog_timer_fn(unsigned long data);
-
static unsigned long wq_watchdog_thresh = 30;
-static struct timer_list wq_watchdog_timer =
- TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
@@ -5401,7 +5382,7 @@ static void wq_watchdog_reset_touched(void)
per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}
-static void wq_watchdog_timer_fn(unsigned long data)
+static void wq_watchdog_timer_fn(struct timer_list *unused)
{
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
bool lockup_detected = false;
@@ -5503,6 +5484,7 @@ module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
static void wq_watchdog_init(void)
{
+ timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
wq_watchdog_set_thresh(wq_watchdog_thresh);
}
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index efdd72e15794..d390d1be3748 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -10,6 +10,7 @@
#include <linux/workqueue.h>
#include <linux/kthread.h>
+#include <linux/preempt.h>
struct worker_pool;
@@ -60,7 +61,7 @@ struct worker {
*/
static inline struct worker *current_wq_worker(void)
{
- if (current->flags & PF_WQ_WORKER)
+ if (in_task() && (current->flags & PF_WQ_WORKER))
return kthread_data(current);
return NULL;
}