summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/scheduler/sched-deadline.txt168
-rw-r--r--arch/arm/kernel/smp.c3
-rw-r--r--arch/arm64/kernel/smp.c3
-rw-r--r--arch/metag/kernel/smp.c3
-rw-r--r--arch/powerpc/kernel/smp.c2
-rw-r--r--arch/x86/events/core.c12
-rw-r--r--arch/x86/include/asm/timer.h8
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/kernel/tsc.c206
-rw-r--r--arch/x86/platform/uv/tlb_uv.c14
-rw-r--r--drivers/acpi/pci_root.c2
-rw-r--r--drivers/base/node.c2
-rw-r--r--drivers/cpufreq/pasemi-cpufreq.c2
-rw-r--r--drivers/cpuidle/cpuidle.c1
-rw-r--r--drivers/iommu/intel-iommu.c4
-rw-r--r--drivers/iommu/of_iommu.c2
-rw-r--r--drivers/xen/manage.c1
-rw-r--r--include/linux/clocksource.h1
-rw-r--r--include/linux/cpumask.h28
-rw-r--r--include/linux/kernel.h6
-rw-r--r--include/linux/llist.h19
-rw-r--r--include/linux/sched.h22
-rw-r--r--include/linux/sched/clock.h11
-rw-r--r--include/uapi/linux/sched.h1
-rw-r--r--init/main.c27
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/extable.c2
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/sched/Makefile4
-rw-r--r--kernel/sched/clock.c128
-rw-r--r--kernel/sched/core.c165
-rw-r--r--kernel/sched/deadline.c550
-rw-r--r--kernel/sched/fair.c182
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/rt.c13
-rw-r--r--kernel/sched/sched.h93
-rw-r--r--kernel/sched/topology.c430
-rw-r--r--kernel/smp.c16
-rw-r--r--kernel/time/clocksource.c3
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--lib/cpumask.c32
-rw-r--r--lib/smp_processor_id.c2
-rw-r--r--mm/vmscan.c2
44 files changed, 1634 insertions, 553 deletions
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index cbc1b46cbf70..e89e36ec15a5 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -7,6 +7,8 @@ CONTENTS
0. WARNING
1. Overview
2. Scheduling algorithm
+ 2.1 Main algorithm
+ 2.2 Bandwidth reclaiming
3. Scheduling Real-Time Tasks
3.1 Definitions
3.2 Schedulability Analysis for Uniprocessor Systems
@@ -44,6 +46,9 @@ CONTENTS
2. Scheduling algorithm
==================
+2.1 Main algorithm
+------------------
+
SCHED_DEADLINE uses three parameters, named "runtime", "period", and
"deadline", to schedule tasks. A SCHED_DEADLINE task should receive
"runtime" microseconds of execution time every "period" microseconds, and
@@ -113,6 +118,160 @@ CONTENTS
remaining runtime = remaining runtime + runtime
+2.2 Bandwidth reclaiming
+------------------------
+
+ Bandwidth reclaiming for deadline tasks is based on the GRUB (Greedy
+ Reclamation of Unused Bandwidth) algorithm [15, 16, 17] and it is enabled
+ when flag SCHED_FLAG_RECLAIM is set.
+
+ The following diagram illustrates the state names for tasks handled by GRUB:
+
+ ------------
+ (d) | Active |
+ ------------->| |
+ | | Contending |
+ | ------------
+ | A |
+ ---------- | |
+ | | | |
+ | Inactive | |(b) | (a)
+ | | | |
+ ---------- | |
+ A | V
+ | ------------
+ | | Active |
+ --------------| Non |
+ (c) | Contending |
+ ------------
+
+ A task can be in one of the following states:
+
+ - ActiveContending: if it is ready for execution (or executing);
+
+ - ActiveNonContending: if it just blocked and has not yet surpassed the 0-lag
+ time;
+
+ - Inactive: if it is blocked and has surpassed the 0-lag time.
+
+ State transitions:
+
+ (a) When a task blocks, it does not become immediately inactive since its
+ bandwidth cannot be immediately reclaimed without breaking the
+ real-time guarantees. It therefore enters a transitional state called
+ ActiveNonContending. The scheduler arms the "inactive timer" to fire at
+ the 0-lag time, when the task's bandwidth can be reclaimed without
+ breaking the real-time guarantees.
+
+ The 0-lag time for a task entering the ActiveNonContending state is
+ computed as
+
+ (runtime * dl_period)
+ deadline - ---------------------
+ dl_runtime
+
+ where runtime is the remaining runtime, while dl_runtime and dl_period
+ are the reservation parameters.
+
+ (b) If the task wakes up before the inactive timer fires, the task re-enters
+ the ActiveContending state and the "inactive timer" is canceled.
+ In addition, if the task wakes up on a different runqueue, then
+ the task's utilization must be removed from the previous runqueue's active
+ utilization and must be added to the new runqueue's active utilization.
+ In order to avoid races between a task waking up on a runqueue while the
+ "inactive timer" is running on a different CPU, the "dl_non_contending"
+ flag is used to indicate that a task is not on a runqueue but is active
+ (so, the flag is set when the task blocks and is cleared when the
+ "inactive timer" fires or when the task wakes up).
+
+ (c) When the "inactive timer" fires, the task enters the Inactive state and
+ its utilization is removed from the runqueue's active utilization.
+
+ (d) When an inactive task wakes up, it enters the ActiveContending state and
+ its utilization is added to the active utilization of the runqueue where
+ it has been enqueued.
+
+ For each runqueue, the algorithm GRUB keeps track of two different bandwidths:
+
+ - Active bandwidth (running_bw): this is the sum of the bandwidths of all
+ tasks in active state (i.e., ActiveContending or ActiveNonContending);
+
+ - Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the
+ runqueue, including the tasks in Inactive state.
+
+
+ The algorithm reclaims the bandwidth of the tasks in Inactive state.
+ It does so by decrementing the runtime of the executing task Ti at a pace equal
+ to
+
+ dq = -max{ Ui, (1 - Uinact) } dt
+
+ where Uinact is the inactive utilization, computed as (this_bq - running_bw),
+ and Ui is the bandwidth of task Ti.
+
+
+ Let's now see a trivial example of two deadline tasks with runtime equal
+ to 4 and period equal to 8 (i.e., bandwidth equal to 0.5):
+
+ A Task T1
+ |
+ | |
+ | |
+ |-------- |----
+ | | V
+ |---|---|---|---|---|---|---|---|--------->t
+ 0 1 2 3 4 5 6 7 8
+
+
+ A Task T2
+ |
+ | |
+ | |
+ | ------------------------|
+ | | V
+ |---|---|---|---|---|---|---|---|--------->t
+ 0 1 2 3 4 5 6 7 8
+
+
+ A running_bw
+ |
+ 1 ----------------- ------
+ | | |
+ 0.5- -----------------
+ | |
+ |---|---|---|---|---|---|---|---|--------->t
+ 0 1 2 3 4 5 6 7 8
+
+
+ - Time t = 0:
+
+ Both tasks are ready for execution and therefore in ActiveContending state.
+ Suppose Task T1 is the first task to start execution.
+ Since there are no inactive tasks, its runtime is decreased as dq = -1 dt.
+
+ - Time t = 2:
+
+ Suppose that task T1 blocks
+ Task T1 therefore enters the ActiveNonContending state. Since its remaining
+ runtime is equal to 2, its 0-lag time is equal to t = 4.
+ Task T2 start execution, with runtime still decreased as dq = -1 dt since
+ there are no inactive tasks.
+
+ - Time t = 4:
+
+ This is the 0-lag time for Task T1. Since it didn't woken up in the
+ meantime, it enters the Inactive state. Its bandwidth is removed from
+ running_bw.
+ Task T2 continues its execution. However, its runtime is now decreased as
+ dq = - 0.5 dt because Uinact = 0.5.
+ Task T2 therefore reclaims the bandwidth unused by Task T1.
+
+ - Time t = 8:
+
+ Task T1 wakes up. It enters the ActiveContending state again, and the
+ running_bw is incremented.
+
+
3. Scheduling Real-Time Tasks
=============================
@@ -330,6 +489,15 @@ CONTENTS
14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
Global EDF. Proceedings of the 22nd Euromicro Conference on
Real-Time Systems, 2010.
+ 15 - G. Lipari, S. Baruah, Greedy reclamation of unused bandwidth in
+ constant-bandwidth servers, 12th IEEE Euromicro Conference on Real-Time
+ Systems, 2000.
+ 16 - L. Abeni, J. Lelli, C. Scordino, L. Palopoli, Greedy CPU reclaiming for
+ SCHED DEADLINE. In Proceedings of the Real-Time Linux Workshop (RTLWS),
+ Dusseldorf, Germany, 2014.
+ 17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
+ or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied
+ Computing, 2016.
4. Bandwidth management
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 572a8df1b766..c9a0a5299827 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -555,8 +555,7 @@ static DEFINE_RAW_SPINLOCK(stop_lock);
*/
static void ipi_cpu_stop(unsigned int cpu)
{
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING) {
+ if (system_state <= SYSTEM_RUNNING) {
raw_spin_lock(&stop_lock);
pr_crit("CPU%u: stopping\n", cpu);
dump_stack();
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 6e0e16a3a7d4..321119881abf 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -961,8 +961,7 @@ void smp_send_stop(void)
cpumask_copy(&mask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &mask);
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING)
+ if (system_state <= SYSTEM_RUNNING)
pr_crit("SMP: stopping secondary CPUs\n");
smp_cross_call(&mask, IPI_CPU_STOP);
}
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index 232a12bf3f99..2dbbb7c66043 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -567,8 +567,7 @@ static void stop_this_cpu(void *data)
{
unsigned int cpu = smp_processor_id();
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING) {
+ if (system_state <= SYSTEM_RUNNING) {
spin_lock(&stop_lock);
pr_crit("CPU%u: stopping\n", cpu);
dump_stack();
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index df2a41647d8e..1069f74fca47 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -97,7 +97,7 @@ int smp_generic_cpu_bootable(unsigned int nr)
/* Special case - we inhibit secondary thread startup
* during boot if the user requests it.
*/
- if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
+ if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
return 0;
if (smt_enabled_at_boot
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 580b60f5ac83..d3990462582c 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2255,7 +2255,7 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
{
- struct cyc2ns_data *data;
+ struct cyc2ns_data data;
u64 offset;
userpg->cap_user_time = 0;
@@ -2267,17 +2267,17 @@ void arch_perf_update_userpage(struct perf_event *event,
if (!using_native_sched_clock() || !sched_clock_stable())
return;
- data = cyc2ns_read_begin();
+ cyc2ns_read_begin(&data);
- offset = data->cyc2ns_offset + __sched_clock_offset;
+ offset = data.cyc2ns_offset + __sched_clock_offset;
/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
*/
userpg->cap_user_time = 1;
- userpg->time_mult = data->cyc2ns_mul;
- userpg->time_shift = data->cyc2ns_shift;
+ userpg->time_mult = data.cyc2ns_mul;
+ userpg->time_shift = data.cyc2ns_shift;
userpg->time_offset = offset - now;
/*
@@ -2289,7 +2289,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->time_zero = offset;
}
- cyc2ns_read_end(data);
+ cyc2ns_read_end();
}
void
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 27e9f9d769b8..2016962103df 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -29,11 +29,9 @@ struct cyc2ns_data {
u32 cyc2ns_mul;
u32 cyc2ns_shift;
u64 cyc2ns_offset;
- u32 __count;
- /* u32 hole */
-}; /* 24 bytes -- do not grow */
+}; /* 16 bytes */
-extern struct cyc2ns_data *cyc2ns_read_begin(void);
-extern void cyc2ns_read_end(struct cyc2ns_data *);
+extern void cyc2ns_read_begin(struct cyc2ns_data *);
+extern void cyc2ns_read_end(void);
#endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f04479a8f74f..045e4f993bd2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid)
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 714dfba6a1e7..5270fc0c2df6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
-/*
- * Use a ring-buffer like data structure, where a writer advances the head by
- * writing a new data entry and a reader advances the tail when it observes a
- * new entry.
- *
- * Writers are made to wait on readers until there's space to write a new
- * entry.
- *
- * This means that we can always use an {offset, mul} pair to compute a ns
- * value that is 'roughly' in the right direction, even if we're writing a new
- * {offset, mul} pair during the clock read.
- *
- * The down-side is that we can no longer guarantee strict monotonicity anymore
- * (assuming the TSC was that to begin with), because while we compute the
- * intersection point of the two clock slopes and make sure the time is
- * continuous at the point of switching; we can no longer guarantee a reader is
- * strictly before or after the switch point.
- *
- * It does mean a reader no longer needs to disable IRQs in order to avoid
- * CPU-Freq updates messing with his times, and similarly an NMI reader will
- * no longer run the risk of hitting half-written state.
- */
-
struct cyc2ns {
- struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
- struct cyc2ns_data *head; /* 48 + 8 = 56 */
- struct cyc2ns_data *tail; /* 56 + 8 = 64 */
-}; /* exactly fits one cacheline */
-
-static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-
-struct cyc2ns_data *cyc2ns_read_begin(void)
-{
- struct cyc2ns_data *head;
-
- preempt_disable();
-
- head = this_cpu_read(cyc2ns.head);
- /*
- * Ensure we observe the entry when we observe the pointer to it.
- * matches the wmb from cyc2ns_write_end().
- */
- smp_read_barrier_depends();
- head->__count++;
- barrier();
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
- return head;
-}
+}; /* fits one cacheline */
-void cyc2ns_read_end(struct cyc2ns_data *head)
-{
- barrier();
- /*
- * If we're the outer most nested read; update the tail pointer
- * when we're done. This notifies possible pending writers
- * that we've observed the head pointer and that the other
- * entry is now free.
- */
- if (!--head->__count) {
- /*
- * x86-TSO does not reorder writes with older reads;
- * therefore once this write becomes visible to another
- * cpu, we must be finished reading the cyc2ns_data.
- *
- * matches with cyc2ns_write_begin().
- */
- this_cpu_write(cyc2ns.tail, head);
- }
- preempt_enable();
-}
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-/*
- * Begin writing a new @data entry for @cpu.
- *
- * Assumes some sort of write side lock; currently 'provided' by the assumption
- * that cpufreq will call its notifiers sequentially.
- */
-static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+void cyc2ns_read_begin(struct cyc2ns_data *data)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
- struct cyc2ns_data *data = c2n->data;
+ int seq, idx;
- if (data == c2n->head)
- data++;
+ preempt_disable_notrace();
- /* XXX send an IPI to @cpu in order to guarantee a read? */
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
- /*
- * When we observe the tail write from cyc2ns_read_end(),
- * the cpu must be done with that entry and its safe
- * to start writing to it.
- */
- while (c2n->tail == data)
- cpu_relax();
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
- return data;
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
-static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+void cyc2ns_read_end(void)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
- /*
- * Ensure the @data writes are visible before we publish the
- * entry. Matches the data-depencency in cyc2ns_read_begin().
- */
- smp_wmb();
-
- ACCESS_ONCE(c2n->head) = data;
+ preempt_enable_notrace();
}
/*
@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_mul = 0;
data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
- data->__count = 0;
}
static void cyc2ns_init(int cpu)
@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
cyc2ns_data_init(&c2n->data[0]);
cyc2ns_data_init(&c2n->data[1]);
- c2n->head = c2n->data;
- c2n->tail = c2n->data;
+ seqcount_init(&c2n->seq);
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data, *tail;
+ struct cyc2ns_data data;
unsigned long long ns;
- /*
- * See cyc2ns_read_*() for details; replicated in order to avoid
- * an extra few instructions that came with the abstraction.
- * Notable, it allows us to only do the __count and tail update
- * dance when its actually needed.
- */
-
- preempt_disable_notrace();
- data = this_cpu_read(cyc2ns.head);
- tail = this_cpu_read(cyc2ns.tail);
-
- if (likely(data == tail)) {
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
- } else {
- data->__count++;
-
- barrier();
-
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
- barrier();
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
- if (!--data->__count)
- this_cpu_write(cyc2ns.tail, data);
- }
- preempt_enable_notrace();
+ cyc2ns_read_end();
return ns;
}
-static void set_cyc2ns_scale(unsigned long khz, int cpu)
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
- unsigned long long tsc_now, ns_now;
- struct cyc2ns_data *data;
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
unsigned long flags;
local_irq_save(flags);
@@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
if (!khz)
goto done;
- data = cyc2ns_write_begin(cpu);
-
- tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
NSEC_PER_MSEC, 0);
/*
@@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* conversion algorithm shifting a 32-bit value (now specifies a 64-bit
* value) - refer perf_event_mmap_page documentation in perf_event.h.
*/
- if (data->cyc2ns_shift == 32) {
- data->cyc2ns_shift = 31;
- data->cyc2ns_mul >>= 1;
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
}
- data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
- cyc2ns_write_end(cpu, data);
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
done:
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
local_irq_restore(flags);
}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -374,6 +273,8 @@ static int __init tsc_setup(char *str)
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
return 1;
}
@@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void)
}
#ifdef CONFIG_CPU_FREQ
-
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
* changes.
*
@@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
}
return 0;
@@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
+
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
@@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = {
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
};
void mark_tsc_unstable(char *reason)
@@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
+ int cpu;
/* Don't bother refining TSC on unstable systems */
if (check_tsc_unstable())
@@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work)
/* Inform the TSC deadline clockevent devices about the recalibration */
lapic_update_tsc_freq();
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
@@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
- u64 lpj;
+ u64 lpj, cyc;
int cpu;
if (!boot_cpu_has(X86_FEATURE_TSC)) {
@@ -1390,9 +1305,10 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
+ cyc = rdtsc();
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
- set_cyc2ns_scale(tsc_khz, cpu);
+ set_cyc2ns_scale(tsc_khz, cpu, cyc);
}
if (tsc_disabled > 0)
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 42e65fee5673..795671593528 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -456,12 +456,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
*/
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long ns;
- ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
+ ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return ns;
}
@@ -470,12 +471,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
*/
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long cyc;
- cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+ cyc2ns_read_begin(&data);
+ cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul;
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return cyc;
}
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 919be0aa2578..240544253ccd 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -523,7 +523,7 @@ static int acpi_pci_root_add(struct acpi_device *device,
struct acpi_pci_root *root;
acpi_handle handle = device->handle;
int no_aspm = 0;
- bool hotadd = system_state != SYSTEM_BOOTING;
+ bool hotadd = system_state == SYSTEM_RUNNING;
root = kzalloc(sizeof(struct acpi_pci_root), GFP_KERNEL);
if (!root)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5548f9686016..0440d95c9b5b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -377,7 +377,7 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
if (!pfn_valid_within(pfn))
return -1;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- if (system_state == SYSTEM_BOOTING)
+ if (system_state < SYSTEM_RUNNING)
return early_pfn_to_nid(pfn);
#endif
page = pfn_to_page(pfn);
diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c
index 35dd4d7ffee0..b257fc7d5204 100644
--- a/drivers/cpufreq/pasemi-cpufreq.c
+++ b/drivers/cpufreq/pasemi-cpufreq.c
@@ -226,7 +226,7 @@ static int pas_cpufreq_cpu_exit(struct cpufreq_policy *policy)
* We don't support CPU hotplug. Don't unmap after the system
* has already made it to a running state.
*/
- if (system_state != SYSTEM_BOOTING)
+ if (system_state >= SYSTEM_RUNNING)
return 0;
if (sdcasr_mapbase)
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 2706be7ed334..60bb64f4329d 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -220,6 +220,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
entered_state = target_state->enter(dev, drv, index);
start_critical_timings();
+ sched_clock_idle_wakeup_event();
time_end = ns_to_ktime(local_clock());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index fc2765ccdb57..8500deda9175 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4315,7 +4315,7 @@ int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
struct acpi_dmar_atsr *atsr;
struct dmar_atsr_unit *atsru;
- if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
+ if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
return 0;
atsr = container_of(hdr, struct acpi_dmar_atsr, header);
@@ -4565,7 +4565,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
struct acpi_dmar_atsr *atsr;
struct acpi_dmar_reserved_memory *rmrr;
- if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
+ if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
return 0;
list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 19779b88a479..8cb60829a7a1 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -103,7 +103,7 @@ static bool of_iommu_driver_present(struct device_node *np)
* it never will be. We don't want to defer indefinitely, nor attempt
* to dereference __iommu_of_table after it's been freed.
*/
- if (system_state > SYSTEM_BOOTING)
+ if (system_state >= SYSTEM_RUNNING)
return false;
return of_match_node(&__iommu_of_table, np);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index c1ec8ee80924..9e35032351a0 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -190,6 +190,7 @@ static void do_poweroff(void)
{
switch (system_state) {
case SYSTEM_BOOTING:
+ case SYSTEM_SCHEDULING:
orderly_poweroff(true);
break;
case SYSTEM_RUNNING:
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f2b10d9ebd04..81490456c242 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -96,6 +96,7 @@ struct clocksource {
void (*suspend)(struct clocksource *cs);
void (*resume)(struct clocksource *cs);
void (*mark_unstable)(struct clocksource *cs);
+ void (*tick_stable)(struct clocksource *cs);
/* private: */
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 2404ad238c0b..4bf4479a3a80 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsigned int i, int node);
(cpu) = cpumask_next_zero((cpu), (mask)), \
(cpu) < nr_cpu_ids;)
+extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+
+/**
+ * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the cpumask poiter
+ * @start: the start location
+ *
+ * The implementation does not assume any bit in @mask is set (including @start).
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_wrap(cpu, mask, start) \
+ for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \
+ (cpu) < nr_cpumask_bits; \
+ (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))
+
/**
* for_each_cpu_and - iterate over every cpu in both masks
* @cpu: the (optionally unsigned) integer iterator
@@ -276,6 +293,12 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
set_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
+static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
+{
+ __set_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
+
/**
* cpumask_clear_cpu - clear a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
@@ -286,6 +309,11 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
+static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
+{
+ __clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
/**
* cpumask_test_cpu - test for a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 13bc08aba704..1c91f26e2996 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -490,9 +490,13 @@ extern int root_mountflags;
extern bool early_boot_irqs_disabled;
-/* Values used for system_state */
+/*
+ * Values used for system_state. Ordering of the states must not be changed
+ * as code checks for <, <=, >, >= STATE.
+ */
extern enum system_states {
SYSTEM_BOOTING,
+ SYSTEM_SCHEDULING,
SYSTEM_RUNNING,
SYSTEM_HALT,
SYSTEM_POWER_OFF,
diff --git a/include/linux/llist.h b/include/linux/llist.h
index 171baa90f6f6..d11738110a7a 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -110,6 +110,25 @@ static inline void init_llist_head(struct llist_head *list)
for ((pos) = (node); pos; (pos) = (pos)->next)
/**
+ * llist_for_each_safe - iterate over some deleted entries of a lock-less list
+ * safe against removal of list entry
+ * @pos: the &struct llist_node to use as a loop cursor
+ * @n: another &struct llist_node to use as temporary storage
+ * @node: the first entry of deleted list entries
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being deleted from list, so start with an entry
+ * instead of list head.
+ *
+ * If being used on entries deleted from lock-less list directly, the
+ * traverse order is from the newest to the oldest added entry. If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each_safe(pos, n, node) \
+ for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n))
+
+/**
* llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
* @pos: the type * to use as a loop cursor.
* @node: the fist entry of deleted list entries.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b69fc650201..1f0f427e0292 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -421,7 +421,8 @@ struct sched_dl_entity {
u64 dl_runtime; /* Maximum runtime for each instance */
u64 dl_deadline; /* Relative deadline of each instance */
u64 dl_period; /* Separation of two instances (period) */
- u64 dl_bw; /* dl_runtime / dl_deadline */
+ u64 dl_bw; /* dl_runtime / dl_period */
+ u64 dl_density; /* dl_runtime / dl_deadline */
/*
* Actual scheduling parameters. Initialized with the values above,
@@ -445,16 +446,33 @@ struct sched_dl_entity {
*
* @dl_yielded tells if task gave up the CPU before consuming
* all its available runtime during the last job.
+ *
+ * @dl_non_contending tells if the task is inactive while still
+ * contributing to the active utilization. In other words, it
+ * indicates if the inactive timer has been armed and its handler
+ * has not been executed yet. This flag is useful to avoid race
+ * conditions between the inactive timer handler and the wakeup
+ * code.
*/
int dl_throttled;
int dl_boosted;
int dl_yielded;
+ int dl_non_contending;
/*
* Bandwidth enforcement timer. Each -deadline task has its
* own bandwidth to be enforced, thus we need one timer per task.
*/
struct hrtimer dl_timer;
+
+ /*
+ * Inactive timer, responsible for decreasing the active utilization
+ * at the "0-lag time". When a -deadline task blocks, it contributes
+ * to GRUB's active utilization until the "0-lag time", hence a
+ * timer is needed to decrease the active utilization at the correct
+ * time.
+ */
+ struct hrtimer inactive_timer;
};
union rcu_special {
@@ -1096,8 +1114,6 @@ static inline struct pid *task_session(struct task_struct *task)
* current.
* task_xid_nr_ns() : id seen from the ns specified;
*
- * set_task_vxid() : assigns a virtual id to a task;
- *
* see also pid_nr() etc in include/linux/pid.h
*/
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 34fe92ce1ebd..a55600ffdf4b 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -23,10 +23,6 @@ extern u64 sched_clock_cpu(int cpu);
extern void sched_clock_init(void);
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline void sched_clock_init_late(void)
-{
-}
-
static inline void sched_clock_tick(void)
{
}
@@ -39,7 +35,7 @@ static inline void sched_clock_idle_sleep_event(void)
{
}
-static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
+static inline void sched_clock_idle_wakeup_event(void)
{
}
@@ -53,7 +49,6 @@ static inline u64 local_clock(void)
return sched_clock();
}
#else
-extern void sched_clock_init_late(void);
extern int sched_clock_stable(void);
extern void clear_sched_clock_stable(void);
@@ -63,10 +58,10 @@ extern void clear_sched_clock_stable(void);
*/
extern u64 __sched_clock_offset;
-
extern void sched_clock_tick(void);
+extern void sched_clock_tick_stable(void);
extern void sched_clock_idle_sleep_event(void);
-extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+extern void sched_clock_idle_wakeup_event(void);
/*
* As outlined in clock.c, provides a fast, high resolution, nanosecond
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 5f0fe019a720..e2a6c7b3510b 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -47,5 +47,6 @@
* For the sched_{set,get}attr() calls
*/
#define SCHED_FLAG_RESET_ON_FORK 0x01
+#define SCHED_FLAG_RECLAIM 0x02
#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/init/main.c b/init/main.c
index f866510472d7..df58a416dd1d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -389,6 +389,7 @@ static __initdata DECLARE_COMPLETION(kthreadd_done);
static noinline void __ref rest_init(void)
{
+ struct task_struct *tsk;
int pid;
rcu_scheduler_starting();
@@ -397,12 +398,32 @@ static noinline void __ref rest_init(void)
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
- kernel_thread(kernel_init, NULL, CLONE_FS);
+ pid = kernel_thread(kernel_init, NULL, CLONE_FS);
+ /*
+ * Pin init on the boot CPU. Task migration is not properly working
+ * until sched_init_smp() has been run. It will set the allowed
+ * CPUs for init to the non isolated CPUs.
+ */
+ rcu_read_lock();
+ tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
+ rcu_read_unlock();
+
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
+
+ /*
+ * Enable might_sleep() and smp_processor_id() checks.
+ * They cannot be enabled earlier because with CONFIG_PRREMPT=y
+ * kernel_thread() would trigger might_sleep() splats. With
+ * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
+ * already, but it's stuck on the kthreadd_done completion.
+ */
+ system_state = SYSTEM_SCHEDULING;
+
complete(&kthreadd_done);
/*
@@ -1015,10 +1036,6 @@ static noinline void __init kernel_init_freeable(void)
* init can allocate pages on any node
*/
set_mems_allowed(node_states[N_MEMORY]);
- /*
- * init can run on any cpu.
- */
- set_cpus_allowed_ptr(current, cpu_all_mask);
cad_pid = task_pid(current);
diff --git a/kernel/async.c b/kernel/async.c
index d2edd6efec56..2cbd3dd5940d 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work)
ktime_t uninitialized_var(calltime), delta, rettime;
/* 1) run (and print duration) */
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
}
entry->func(entry->data, entry->cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
@@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
{
ktime_t uninitialized_var(starttime), delta, endtime;
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
wait_event(async_done, lowest_in_progress(domain) >= cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
diff --git a/kernel/extable.c b/kernel/extable.c
index 2676d7f8baf6..0fbdd8582f08 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr)
addr < (unsigned long)_etext)
return 1;
- if (system_state == SYSTEM_BOOTING &&
+ if (system_state < SYSTEM_RUNNING &&
init_kernel_text(addr))
return 1;
return 0;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a1db38abac5b..bd53ea579dc8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1175,7 +1175,7 @@ static void boot_delay_msec(int level)
unsigned long long k;
unsigned long timeout;
- if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+ if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
|| suppress_message_printing(level)) {
return;
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 16277e2ed8ee..53f0164ed362 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,9 +16,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
+obj-y += idle_task.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 00a45c45beca..ca0f8fc945c6 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -64,6 +64,7 @@
#include <linux/workqueue.h>
#include <linux/compiler.h>
#include <linux/tick.h>
+#include <linux/init.h>
/*
* Scheduler clock - returns current time in nanosec units.
@@ -124,14 +125,27 @@ int sched_clock_stable(void)
return static_branch_likely(&__sched_clock_stable);
}
+static void __scd_stamp(struct sched_clock_data *scd)
+{
+ scd->tick_gtod = ktime_get_ns();
+ scd->tick_raw = sched_clock();
+}
+
static void __set_sched_clock_stable(void)
{
- struct sched_clock_data *scd = this_scd();
+ struct sched_clock_data *scd;
/*
+ * Since we're still unstable and the tick is already running, we have
+ * to disable IRQs in order to get a consistent scd->tick* reading.
+ */
+ local_irq_disable();
+ scd = this_scd();
+ /*
* Attempt to make the (initial) unstable->stable transition continuous.
*/
__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
+ local_irq_enable();
printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
scd->tick_gtod, __gtod_offset,
@@ -141,8 +155,38 @@ static void __set_sched_clock_stable(void)
tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
+/*
+ * If we ever get here, we're screwed, because we found out -- typically after
+ * the fact -- that TSC wasn't good. This means all our clocksources (including
+ * ktime) could have reported wrong values.
+ *
+ * What we do here is an attempt to fix up and continue sort of where we left
+ * off in a coherent manner.
+ *
+ * The only way to fully avoid random clock jumps is to boot with:
+ * "tsc=unstable".
+ */
static void __sched_clock_work(struct work_struct *work)
{
+ struct sched_clock_data *scd;
+ int cpu;
+
+ /* take a current timestamp and set 'now' */
+ preempt_disable();
+ scd = this_scd();
+ __scd_stamp(scd);
+ scd->clock = scd->tick_gtod + __gtod_offset;
+ preempt_enable();
+
+ /* clone to all CPUs */
+ for_each_possible_cpu(cpu)
+ per_cpu(sched_clock_data, cpu) = *scd;
+
+ printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
+ printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
+
static_branch_disable(&__sched_clock_stable);
}
@@ -150,27 +194,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work);
static void __clear_sched_clock_stable(void)
{
- struct sched_clock_data *scd = this_scd();
-
- /*
- * Attempt to make the stable->unstable transition continuous.
- *
- * Trouble is, this is typically called from the TSC watchdog
- * timer, which is late per definition. This means the tick
- * values can already be screwy.
- *
- * Still do what we can.
- */
- __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
-
- printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
- scd->tick_gtod, __gtod_offset,
- scd->tick_raw, __sched_clock_offset);
+ if (!sched_clock_stable())
+ return;
tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
-
- if (sched_clock_stable())
- schedule_work(&sched_clock_work);
+ schedule_work(&sched_clock_work);
}
void clear_sched_clock_stable(void)
@@ -183,7 +211,11 @@ void clear_sched_clock_stable(void)
__clear_sched_clock_stable();
}
-void sched_clock_init_late(void)
+/*
+ * We run this as late_initcall() such that it runs after all built-in drivers,
+ * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
+ */
+static int __init sched_clock_init_late(void)
{
sched_clock_running = 2;
/*
@@ -197,7 +229,10 @@ void sched_clock_init_late(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
+
+ return 0;
}
+late_initcall(sched_clock_init_late);
/*
* min, max except they take wrapping into account
@@ -347,21 +382,38 @@ void sched_clock_tick(void)
{
struct sched_clock_data *scd;
+ if (sched_clock_stable())
+ return;
+
+ if (unlikely(!sched_clock_running))
+ return;
+
WARN_ON_ONCE(!irqs_disabled());
+ scd = this_scd();
+ __scd_stamp(scd);
+ sched_clock_local(scd);
+}
+
+void sched_clock_tick_stable(void)
+{
+ u64 gtod, clock;
+
+ if (!sched_clock_stable())
+ return;
+
/*
- * Update these values even if sched_clock_stable(), because it can
- * become unstable at any point in time at which point we need some
- * values to fall back on.
+ * Called under watchdog_lock.
*
- * XXX arguably we can skip this if we expose tsc_clocksource_reliable
+ * The watchdog just found this TSC to (still) be stable, so now is a
+ * good moment to update our __gtod_offset. Because once we find the
+ * TSC to be unstable, any computation will be computing crap.
*/
- scd = this_scd();
- scd->tick_raw = sched_clock();
- scd->tick_gtod = ktime_get_ns();
-
- if (!sched_clock_stable() && likely(sched_clock_running))
- sched_clock_local(scd);
+ local_irq_disable();
+ gtod = ktime_get_ns();
+ clock = sched_clock();
+ __gtod_offset = (clock + __sched_clock_offset) - gtod;
+ local_irq_enable();
}
/*
@@ -374,15 +426,21 @@ void sched_clock_idle_sleep_event(void)
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
- * We just idled delta nanoseconds (called with irqs disabled):
+ * We just idled; resync with ktime.
*/
-void sched_clock_idle_wakeup_event(u64 delta_ns)
+void sched_clock_idle_wakeup_event(void)
{
- if (timekeeping_suspended)
+ unsigned long flags;
+
+ if (sched_clock_stable())
+ return;
+
+ if (unlikely(timekeeping_suspended))
return;
+ local_irq_save(flags);
sched_clock_tick();
- touch_softlockup_watchdog_sched();
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7b9ef8df126..62166da1c359 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -789,36 +789,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-void sched_set_stop_task(int cpu, struct task_struct *stop)
-{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
- struct task_struct *old_stop = cpu_rq(cpu)->stop;
-
- if (stop) {
- /*
- * Make it appear like a SCHED_FIFO task, its something
- * userspace knows about and won't get confused about.
- *
- * Also, it will make PI more or less work without too
- * much confusion -- but then, stop work should not
- * rely on PI working anyway.
- */
- sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
-
- stop->sched_class = &stop_sched_class;
- }
-
- cpu_rq(cpu)->stop = stop;
-
- if (old_stop) {
- /*
- * Reset it back to a normal scheduling class so that
- * it can die in pieces.
- */
- old_stop->sched_class = &rt_sched_class;
- }
-}
-
/*
* __normal_prio - return the priority that is based on the static prio
*/
@@ -1589,6 +1559,36 @@ static void update_avg(u64 *avg, u64 sample)
*avg += diff >> 3;
}
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, its something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+ stop->sched_class = &stop_sched_class;
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling class so that
+ * it can die in pieces.
+ */
+ old_stop->sched_class = &rt_sched_class;
+ }
+}
+
#else
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
@@ -1732,7 +1732,7 @@ void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
- struct task_struct *p;
+ struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
@@ -1741,17 +1741,8 @@ void sched_ttwu_pending(void)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- while (llist) {
- int wake_flags = 0;
-
- p = llist_entry(llist, struct task_struct, wake_entry);
- llist = llist_next(llist);
-
- if (p->sched_remote_wakeup)
- wake_flags = WF_MIGRATED;
-
- ttwu_do_activate(rq, p, wake_flags, &rf);
- }
+ llist_for_each_entry_safe(p, t, llist, wake_entry)
+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
rq_unlock_irqrestore(rq, &rf);
}
@@ -2160,9 +2151,11 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
+ dl_se->dl_density = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
+ dl_se->dl_non_contending = 0;
}
/*
@@ -2194,6 +2187,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
RB_CLEAR_NODE(&p->dl.rb_node);
init_dl_task_timer(&p->dl);
+ init_dl_inactive_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2431,7 +2425,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
- return 1ULL << 20;
+ return BW_UNIT;
/*
* Doing this here saves a lot of checks in all
@@ -2441,7 +2435,7 @@ unsigned long to_ratio(u64 period, u64 runtime)
if (period == 0)
return 0;
- return div64_u64(runtime << 20, period);
+ return div64_u64(runtime << BW_SHIFT, period);
}
#ifdef CONFIG_SMP
@@ -2452,7 +2446,7 @@ inline struct dl_bw *dl_bw_of(int i)
return &cpu_rq(i)->rd->dl_bw;
}
-static inline int dl_bw_cpus(int i)
+inline int dl_bw_cpus(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
int cpus = 0;
@@ -2470,7 +2464,7 @@ inline struct dl_bw *dl_bw_of(int i)
return &cpu_rq(i)->dl.dl_bw;
}
-static inline int dl_bw_cpus(int i)
+inline int dl_bw_cpus(int i)
{
return 1;
}
@@ -2483,9 +2477,6 @@ static inline int dl_bw_cpus(int i)
* allocated bandwidth to reflect the new situation.
*
* This function is called while holding p's rq->lock.
- *
- * XXX we should delay bw change until the task's 0-lag point, see
- * __setparam_dl().
*/
static int dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
@@ -2510,15 +2501,29 @@ static int dl_overflow(struct task_struct *p, int policy,
cpus = dl_bw_cpus(task_cpu(p));
if (dl_policy(policy) && !task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, 0, new_bw)) {
- __dl_add(dl_b, new_bw);
+ if (hrtimer_active(&p->dl.inactive_timer))
+ __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
- __dl_clear(dl_b, p->dl.dl_bw);
- __dl_add(dl_b, new_bw);
+ /*
+ * XXX this is slightly incorrect: when the task
+ * utilization decreases, we should delay the total
+ * utilization change until the task's 0-lag point.
+ * But this would require to set the task's "inactive
+ * timer" when the task is not inactive.
+ */
+ __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+ dl_change_utilization(p, new_bw);
err = 0;
} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
- __dl_clear(dl_b, p->dl.dl_bw);
+ /*
+ * Do not decrease the total deadline utilization here,
+ * switched_from_dl() will take care to do it at the correct
+ * (0-lag) time.
+ */
err = 0;
}
raw_spin_unlock(&dl_b->lock);
@@ -4027,26 +4032,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-
- /*
- * Changing the parameters of a task is 'tricky' and we're not doing
- * the correct thing -- also see task_dead_dl() and switched_from_dl().
- *
- * What we SHOULD do is delay the bandwidth release until the 0-lag
- * point. This would include retaining the task_struct until that time
- * and change dl_overflow() to not immediately decrement the current
- * amount.
- *
- * Instead we retain the current runtime/deadline and let the new
- * parameters take effect after the current reservation period lapses.
- * This is safe (albeit pessimistic) because the 0-lag point is always
- * before the current scheduling deadline.
- *
- * We can still have temporary overloads because we do not delay the
- * change in bandwidth until that time; so admission control is
- * not on the safe side. It does however guarantee tasks will never
- * consume more than promised.
- */
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
/*
@@ -4198,8 +4184,8 @@ static int __sched_setscheduler(struct task_struct *p,
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
- /* May grab non-irq protected spin_locks: */
- BUG_ON(in_interrupt());
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
@@ -4212,7 +4198,8 @@ recheck:
return -EINVAL;
}
- if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+ if (attr->sched_flags &
+ ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
return -EINVAL;
/*
@@ -5531,7 +5518,7 @@ int task_can_attach(struct task_struct *p,
* We will free resources in the source root_domain
* later on (see set_cpus_allowed_dl()).
*/
- __dl_add(dl_b, p->dl.dl_bw);
+ __dl_add(dl_b, p->dl.dl_bw, cpus);
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
@@ -5959,7 +5946,6 @@ void __init sched_init_smp(void)
cpumask_var_t non_isolated_cpus;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
sched_init_numa();
@@ -5969,7 +5955,7 @@ void __init sched_init_smp(void)
* happen.
*/
mutex_lock(&sched_domains_mutex);
- init_sched_domains(cpu_active_mask);
+ sched_init_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -5985,7 +5971,6 @@ void __init sched_init_smp(void)
init_sched_dl_class();
sched_init_smt();
- sched_clock_init_late();
sched_smp_initialized = true;
}
@@ -6001,7 +5986,6 @@ early_initcall(migration_init);
void __init sched_init_smp(void)
{
sched_init_granularity();
- sched_clock_init_late();
}
#endif /* CONFIG_SMP */
@@ -6185,7 +6169,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
@@ -6237,8 +6220,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
!is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
return;
+
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
@@ -6763,6 +6748,19 @@ static int sched_dl_global_validate(void)
return ret;
}
+void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
+{
+ if (global_rt_runtime() == RUNTIME_INF) {
+ dl_rq->bw_ratio = 1 << RATIO_SHIFT;
+ dl_rq->extra_bw = 1 << BW_SHIFT;
+ } else {
+ dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
+ global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
+ dl_rq->extra_bw = to_ratio(global_rt_period(),
+ global_rt_runtime());
+ }
+}
+
static void sched_dl_do_global(void)
{
u64 new_bw = -1;
@@ -6788,6 +6786,7 @@ static void sched_dl_do_global(void)
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
+ init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
}
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a2ce59015642..e12f85975857 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -43,6 +43,222 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
+static inline
+void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->running_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->running_bw += dl_bw;
+ SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
+ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+}
+
+static inline
+void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->running_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->running_bw -= dl_bw;
+ SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
+ if (dl_rq->running_bw > old)
+ dl_rq->running_bw = 0;
+}
+
+static inline
+void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->this_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->this_bw += dl_bw;
+ SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
+}
+
+static inline
+void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->this_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->this_bw -= dl_bw;
+ SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
+ if (dl_rq->this_bw > old)
+ dl_rq->this_bw = 0;
+ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+}
+
+void dl_change_utilization(struct task_struct *p, u64 new_bw)
+{
+ struct rq *rq;
+
+ if (task_on_rq_queued(p))
+ return;
+
+ rq = task_rq(p);
+ if (p->dl.dl_non_contending) {
+ sub_running_bw(p->dl.dl_bw, &rq->dl);
+ p->dl.dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
+ }
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ add_rq_bw(new_bw, &rq->dl);
+}
+
+/*
+ * The utilization of a task cannot be immediately removed from
+ * the rq active utilization (running_bw) when the task blocks.
+ * Instead, we have to wait for the so called "0-lag time".
+ *
+ * If a task blocks before the "0-lag time", a timer (the inactive
+ * timer) is armed, and running_bw is decreased when the timer
+ * fires.
+ *
+ * If the task wakes up again before the inactive timer fires,
+ * the timer is cancelled, whereas if the task wakes up after the
+ * inactive timer fired (and running_bw has been decreased) the
+ * task's utilization has to be added to running_bw again.
+ * A flag in the deadline scheduling entity (dl_non_contending)
+ * is used to avoid race conditions between the inactive timer handler
+ * and task wakeups.
+ *
+ * The following diagram shows how running_bw is updated. A task is
+ * "ACTIVE" when its utilization contributes to running_bw; an
+ * "ACTIVE contending" task is in the TASK_RUNNING state, while an
+ * "ACTIVE non contending" task is a blocked task for which the "0-lag time"
+ * has not passed yet. An "INACTIVE" task is a task for which the "0-lag"
+ * time already passed, which does not contribute to running_bw anymore.
+ * +------------------+
+ * wakeup | ACTIVE |
+ * +------------------>+ contending |
+ * | add_running_bw | |
+ * | +----+------+------+
+ * | | ^
+ * | dequeue | |
+ * +--------+-------+ | |
+ * | | t >= 0-lag | | wakeup
+ * | INACTIVE |<---------------+ |
+ * | | sub_running_bw | |
+ * +--------+-------+ | |
+ * ^ | |
+ * | t < 0-lag | |
+ * | | |
+ * | V |
+ * | +----+------+------+
+ * | sub_running_bw | ACTIVE |
+ * +-------------------+ |
+ * inactive timer | non contending |
+ * fired +------------------+
+ *
+ * The task_non_contending() function is invoked when a task
+ * blocks, and checks if the 0-lag time already passed or
+ * not (in the first case, it directly updates running_bw;
+ * in the second case, it arms the inactive timer).
+ *
+ * The task_contending() function is invoked when a task wakes
+ * up, and checks if the task is still in the "ACTIVE non contending"
+ * state or not (in the second case, it updates running_bw).
+ */
+static void task_non_contending(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct hrtimer *timer = &dl_se->inactive_timer;
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+ s64 zerolag_time;
+
+ /*
+ * If this is a non-deadline task that has been boosted,
+ * do nothing
+ */
+ if (dl_se->dl_runtime == 0)
+ return;
+
+ WARN_ON(hrtimer_active(&dl_se->inactive_timer));
+ WARN_ON(dl_se->dl_non_contending);
+
+ zerolag_time = dl_se->deadline -
+ div64_long((dl_se->runtime * dl_se->dl_period),
+ dl_se->dl_runtime);
+
+ /*
+ * Using relative times instead of the absolute "0-lag time"
+ * allows to simplify the code
+ */
+ zerolag_time -= rq_clock(rq);
+
+ /*
+ * If the "0-lag time" already passed, decrease the active
+ * utilization now, instead of starting a timer
+ */
+ if (zerolag_time < 0) {
+ if (dl_task(p))
+ sub_running_bw(dl_se->dl_bw, dl_rq);
+ if (!dl_task(p) || p->state == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (p->state == TASK_DEAD)
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ raw_spin_lock(&dl_b->lock);
+ __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_clear_params(p);
+ raw_spin_unlock(&dl_b->lock);
+ }
+
+ return;
+ }
+
+ dl_se->dl_non_contending = 1;
+ get_task_struct(p);
+ hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+}
+
+static void task_contending(struct sched_dl_entity *dl_se, int flags)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ /*
+ * If this is a non-deadline task that has been boosted,
+ * do nothing
+ */
+ if (dl_se->dl_runtime == 0)
+ return;
+
+ if (flags & ENQUEUE_MIGRATED)
+ add_rq_bw(dl_se->dl_bw, dl_rq);
+
+ if (dl_se->dl_non_contending) {
+ dl_se->dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
+ put_task_struct(dl_task_of(dl_se));
+ } else {
+ /*
+ * Since "dl_non_contending" is not set, the
+ * task's utilization has already been removed from
+ * active utilization (either when the task blocked,
+ * when the "inactive timer" fired).
+ * So, add it back.
+ */
+ add_running_bw(dl_se->dl_bw, dl_rq);
+ }
+}
+
static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
@@ -83,6 +299,10 @@ void init_dl_rq(struct dl_rq *dl_rq)
#else
init_dl_bw(&dl_rq->dl_bw);
#endif
+
+ dl_rq->running_bw = 0;
+ dl_rq->this_bw = 0;
+ init_dl_rq_bw_ratio(dl_rq);
}
#ifdef CONFIG_SMP
@@ -484,13 +704,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
}
/*
- * When a -deadline entity is queued back on the runqueue, its runtime and
- * deadline might need updating.
+ * Revised wakeup rule [1]: For self-suspending tasks, rather then
+ * re-initializing task's runtime and deadline, the revised wakeup
+ * rule adjusts the task's runtime to avoid the task to overrun its
+ * density.
+ *
+ * Reasoning: a task may overrun the density if:
+ * runtime / (deadline - t) > dl_runtime / dl_deadline
+ *
+ * Therefore, runtime can be adjusted to:
+ * runtime = (dl_runtime / dl_deadline) * (deadline - t)
+ *
+ * In such way that runtime will be equal to the maximum density
+ * the task can use without breaking any rule.
+ *
+ * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
+ * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
+ */
+static void
+update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ u64 laxity = dl_se->deadline - rq_clock(rq);
+
+ /*
+ * If the task has deadline < period, and the deadline is in the past,
+ * it should already be throttled before this check.
+ *
+ * See update_dl_entity() comments for further details.
+ */
+ WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
+
+ dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
+}
+
+/*
+ * Regarding the deadline, a task with implicit deadline has a relative
+ * deadline == relative period. A task with constrained deadline has a
+ * relative deadline <= relative period.
+ *
+ * We support constrained deadline tasks. However, there are some restrictions
+ * applied only for tasks which do not have an implicit deadline. See
+ * update_dl_entity() to know more about such restrictions.
+ *
+ * The dl_is_implicit() returns true if the task has an implicit deadline.
+ */
+static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline == dl_se->dl_period;
+}
+
+/*
+ * When a deadline entity is placed in the runqueue, its runtime and deadline
+ * might need to be updated. This is done by a CBS wake up rule. There are two
+ * different rules: 1) the original CBS; and 2) the Revisited CBS.
+ *
+ * When the task is starting a new period, the Original CBS is used. In this
+ * case, the runtime is replenished and a new absolute deadline is set.
+ *
+ * When a task is queued before the begin of the next period, using the
+ * remaining runtime and deadline could make the entity to overflow, see
+ * dl_entity_overflow() to find more about runtime overflow. When such case
+ * is detected, the runtime and deadline need to be updated.
*
- * The policy here is that we update the deadline of the entity only if:
- * - the current deadline is in the past,
- * - using the remaining runtime with the current deadline would make
- * the entity exceed its bandwidth.
+ * If the task has an implicit deadline, i.e., deadline == period, the Original
+ * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * set, as in the previous cases.
+ *
+ * However, the Original CBS does not work properly for tasks with
+ * deadline < period, which are said to have a constrained deadline. By
+ * applying the Original CBS, a constrained deadline task would be able to run
+ * runtime/deadline in a period. With deadline < period, the task would
+ * overrun the runtime/period allowed bandwidth, breaking the admission test.
+ *
+ * In order to prevent this misbehave, the Revisited CBS is used for
+ * constrained deadline tasks when a runtime overflow is detected. In the
+ * Revisited CBS, rather than replenishing & setting a new absolute deadline,
+ * the remaining runtime of the task is reduced to avoid runtime overflow.
+ * Please refer to the comments update_dl_revised_wakeup() function to find
+ * more about the Revised CBS rule.
*/
static void update_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
@@ -500,6 +791,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+
+ if (unlikely(!dl_is_implicit(dl_se) &&
+ !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ !dl_se->dl_boosted)){
+ update_dl_revised_wakeup(dl_se, rq);
+ return;
+ }
+
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
@@ -593,10 +892,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* The task might have changed its scheduling policy to something
* different than SCHED_DEADLINE (through switched_from_dl()).
*/
- if (!dl_task(p)) {
- __dl_clear_params(p);
+ if (!dl_task(p))
goto unlock;
- }
/*
* The task might have been boosted by someone else and might be in the
@@ -723,6 +1020,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
return;
dl_se->dl_throttled = 1;
+ if (dl_se->runtime > 0)
+ dl_se->runtime = 0;
}
}
@@ -735,6 +1034,47 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
/*
+ * This function implements the GRUB accounting rule:
+ * according to the GRUB reclaiming algorithm, the runtime is
+ * not decreased as "dq = -dt", but as
+ * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * where u is the utilization of the task, Umax is the maximum reclaimable
+ * utilization, Uinact is the (per-runqueue) inactive utilization, computed
+ * as the difference between the "total runqueue utilization" and the
+ * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * reclaimable utilization.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
+ * multiplied by 2^BW_SHIFT, the result has to be shifted right by
+ * BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
+ * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value
+ * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
+ * So, overflow is not an issue here.
+ */
+u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
+{
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
+ u64 u_act;
+ u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+
+ /*
+ * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
+ * we compare u_inact + rq->dl.extra_bw with
+ * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
+ * u_inact + rq->dl.extra_bw can be larger than
+ * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
+ * leading to wrong results)
+ */
+ if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
+ u_act = u_act_min;
+ else
+ u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+
+ return (delta * u_act) >> BW_SHIFT;
+}
+
+/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
*/
@@ -776,6 +1116,8 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
+ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
+ delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
dl_se->runtime -= delta_exec;
throttle:
@@ -815,6 +1157,56 @@ throttle:
}
}
+static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
+{
+ struct sched_dl_entity *dl_se = container_of(timer,
+ struct sched_dl_entity,
+ inactive_timer);
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+
+ if (!dl_task(p) || p->state == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
+ sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+ sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+ dl_se->dl_non_contending = 0;
+ }
+
+ raw_spin_lock(&dl_b->lock);
+ __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(p);
+
+ goto unlock;
+ }
+ if (dl_se->dl_non_contending == 0)
+ goto unlock;
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ sub_running_bw(dl_se->dl_bw, &rq->dl);
+ dl_se->dl_non_contending = 0;
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+{
+ struct hrtimer *timer = &dl_se->inactive_timer;
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = inactive_task_timer;
+}
+
#ifdef CONFIG_SMP
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -946,10 +1338,12 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
- if (flags & ENQUEUE_WAKEUP)
+ if (flags & ENQUEUE_WAKEUP) {
+ task_contending(dl_se, flags);
update_dl_entity(dl_se, pi_se);
- else if (flags & ENQUEUE_REPLENISH)
+ } else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
+ }
__enqueue_dl_entity(dl_se);
}
@@ -959,11 +1353,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
__dequeue_dl_entity(dl_se);
}
-static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
-{
- return dl_se->dl_deadline < dl_se->dl_period;
-}
-
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -995,17 +1384,32 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* If that is the case, the task will be throttled and
* the replenishment timer will be set to the next period.
*/
- if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
+ if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
dl_check_constrained_dl(&p->dl);
+ if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
+ add_rq_bw(p->dl.dl_bw, &rq->dl);
+ add_running_bw(p->dl.dl_bw, &rq->dl);
+ }
+
/*
- * If p is throttled, we do nothing. In fact, if it exhausted
+ * If p is throttled, we do not enqueue it. In fact, if it exhausted
* its budget it needs a replenishment and, since it now is on
* its rq, the bandwidth timer callback (which clearly has not
* run yet) will take care of this.
+ * However, the active utilization does not depend on the fact
+ * that the task is on the runqueue or not (but depends on the
+ * task's state - in GRUB parlance, "inactive" vs "active contending").
+ * In other words, even if a task is throttled its utilization must
+ * be counted in the active utilization; hence, we need to call
+ * add_running_bw().
*/
- if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
+ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+ if (flags & ENQUEUE_WAKEUP)
+ task_contending(&p->dl, flags);
+
return;
+ }
enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -1023,6 +1427,23 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
__dequeue_task_dl(rq, p, flags);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
+ sub_running_bw(p->dl.dl_bw, &rq->dl);
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ }
+
+ /*
+ * This check allows to start the inactive timer (or to immediately
+ * decrease the active utilization, if needed) in two cases:
+ * when the task blocks and when it is terminating
+ * (p->state == TASK_DEAD). We can handle the two cases in the same
+ * way, because from GRUB's point of view the same thing is happening
+ * (the task moves from "active contending" to "active non contending"
+ * or "inactive")
+ */
+ if (flags & DEQUEUE_SLEEP)
+ task_non_contending(p);
}
/*
@@ -1100,6 +1521,37 @@ out:
return cpu;
}
+static void migrate_task_rq_dl(struct task_struct *p)
+{
+ struct rq *rq;
+
+ if (p->state != TASK_WAKING)
+ return;
+
+ rq = task_rq(p);
+ /*
+ * Since p->state == TASK_WAKING, set_task_cpu() has been called
+ * from try_to_wake_up(). Hence, p->pi_lock is locked, but
+ * rq->lock is not... So, lock it
+ */
+ raw_spin_lock(&rq->lock);
+ if (p->dl.dl_non_contending) {
+ sub_running_bw(p->dl.dl_bw, &rq->dl);
+ p->dl.dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
+ }
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ raw_spin_unlock(&rq->lock);
+}
+
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
{
/*
@@ -1255,19 +1707,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-static void task_dead_dl(struct task_struct *p)
-{
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-
- /*
- * Since we are TASK_DEAD we won't slip out of the domain!
- */
- raw_spin_lock_irq(&dl_b->lock);
- /* XXX we should retain the bw until 0-lag */
- dl_b->total_bw -= p->dl.dl_bw;
- raw_spin_unlock_irq(&dl_b->lock);
-}
-
static void set_curr_task_dl(struct rq *rq)
{
struct task_struct *p = rq->curr;
@@ -1533,7 +1972,7 @@ retry:
* then possible that next_task has migrated.
*/
task = pick_next_pushable_dl_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
@@ -1551,7 +1990,11 @@ retry:
}
deactivate_task(rq, next_task, 0);
+ sub_running_bw(next_task->dl.dl_bw, &rq->dl);
+ sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
+ add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
+ add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
activate_task(later_rq, next_task, 0);
ret = 1;
@@ -1639,7 +2082,11 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
+ sub_running_bw(p->dl.dl_bw, &src_rq->dl);
+ sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
set_task_cpu(p, this_cpu);
+ add_rq_bw(p->dl.dl_bw, &this_rq->dl);
+ add_running_bw(p->dl.dl_bw, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -1695,7 +2142,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* until we complete the update.
*/
raw_spin_lock(&src_dl_b->lock);
- __dl_clear(src_dl_b, p->dl.dl_bw);
+ __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
@@ -1737,13 +2184,26 @@ void __init init_sched_dl_class(void)
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
/*
- * Start the deadline timer; if we switch back to dl before this we'll
- * continue consuming our current CBS slice. If we stay outside of
- * SCHED_DEADLINE until the deadline passes, the timer will reset the
- * task.
+ * task_non_contending() can start the "inactive timer" (if the 0-lag
+ * time is in the future). If the task switches back to dl before
+ * the "inactive timer" fires, it can continue to consume its current
+ * runtime using its current deadline. If it stays outside of
+ * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
+ * will reset the task parameters.
*/
- if (!start_dl_timer(p))
- __dl_clear_params(p);
+ if (task_on_rq_queued(p) && p->dl.dl_runtime)
+ task_non_contending(p);
+
+ if (!task_on_rq_queued(p))
+ sub_rq_bw(p->dl.dl_bw, &rq->dl);
+
+ /*
+ * We cannot use inactive_task_timer() to invoke sub_running_bw()
+ * at the 0-lag time, because the task could have been migrated
+ * while SCHED_OTHER in the meanwhile.
+ */
+ if (p->dl.dl_non_contending)
+ p->dl.dl_non_contending = 0;
/*
* Since this might be the only -deadline task on the rq,
@@ -1762,11 +2222,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
/* If p is not queued we will update its parameters at next wakeup. */
- if (!task_on_rq_queued(p))
- return;
+ if (!task_on_rq_queued(p)) {
+ add_rq_bw(p->dl.dl_bw, &rq->dl);
+ return;
+ }
/*
* If p is boosted we already updated its params in
* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
@@ -1836,6 +2300,7 @@ const struct sched_class dl_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
+ .migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
@@ -1845,7 +2310,6 @@ const struct sched_class dl_sched_class = {
.set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
- .task_dead = task_dead_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c77e4b1d51c0..a24661ac3d23 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+ leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
@@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
@@ -2469,7 +2470,8 @@ void task_numa_work(struct callback_head *work)
return;
- down_read(&mm->mmap_sem);
+ if (!down_read_trylock(&mm->mmap_sem))
+ return;
vma = find_vma(mm, start);
if (!vma) {
reset_ptenuma_scan(p);
@@ -2916,12 +2918,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
if (cfs_rq) {
cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
}
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
return 1;
}
@@ -4642,24 +4644,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
+/*
+ * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online calback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
- struct cfs_rq *cfs_rq;
+ struct task_group *tg;
- for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+ lockdep_assert_held(&rq->lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
raw_spin_lock(&cfs_b->lock);
cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
raw_spin_unlock(&cfs_b->lock);
}
+ rcu_read_unlock();
}
+/* cpu offline callback */
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
- struct cfs_rq *cfs_rq;
+ struct task_group *tg;
+
+ lockdep_assert_held(&rq->lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- for_each_leaf_cfs_rq(rq, cfs_rq) {
if (!cfs_rq->runtime_enabled)
continue;
@@ -4677,6 +4698,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
+ rcu_read_unlock();
}
#else /* CONFIG_CFS_BANDWIDTH */
@@ -5484,12 +5506,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int i;
/* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_cpus(group),
+ if (!cpumask_intersects(sched_group_span(group),
&p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
+ sched_group_span(group));
/*
* Tally up the load of all CPUs in the group and find
@@ -5499,7 +5521,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
runnable_load = 0;
max_spare_cap = 0;
- for_each_cpu(i, sched_group_cpus(group)) {
+ for_each_cpu(i, sched_group_span(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
@@ -5602,10 +5624,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/* Check if we have any choice: */
if (group->group_weight == 1)
- return cpumask_first(sched_group_cpus(group));
+ return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+ for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5640,43 +5662,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-/*
- * Implement a for_each_cpu() variant that starts the scan at a given cpu
- * (@start), and wraps around.
- *
- * This is used to scan for idle CPUs; such that not all CPUs looking for an
- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
- * through the LLC domain.
- *
- * Especially tbench is found sensitive to this.
- */
-
-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
-{
- int next;
-
-again:
- next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
-
- if (*wrapped) {
- if (next >= start)
- return nr_cpumask_bits;
- } else {
- if (next >= nr_cpumask_bits) {
- *wrapped = 1;
- n = -1;
- goto again;
- }
- }
-
- return next;
-}
-
-#define for_each_cpu_wrap(cpu, mask, start, wrap) \
- for ((wrap) = 0, (cpu) = (start)-1; \
- (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
- (cpu) < nr_cpumask_bits; )
-
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5736,7 +5721,7 @@ unlock:
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu, wrap;
+ int core, cpu;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5746,7 +5731,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
- for_each_cpu_wrap(core, cpus, target, wrap) {
+ for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -5809,27 +5794,38 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
struct sched_domain *this_sd;
- u64 avg_cost, avg_idle = this_rq()->avg_idle;
+ u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
- int cpu, wrap;
+ int cpu, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
return -1;
- avg_cost = this_sd->avg_scan_cost;
-
/*
* Due to large variance we need a large fuzz factor; hackbench in
* particularly is sensitive here.
*/
- if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
+ avg_idle = this_rq()->avg_idle / 512;
+ avg_cost = this_sd->avg_scan_cost + 1;
+
+ if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
return -1;
+ if (sched_feat(SIS_PROP)) {
+ u64 span_avg = sd->span_weight * avg_idle;
+ if (span_avg > 4*avg_cost)
+ nr = div_u64(span_avg, avg_cost);
+ else
+ nr = 4;
+ }
+
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ if (!--nr)
+ return -1;
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
@@ -6168,8 +6164,11 @@ static void set_last_buddy(struct sched_entity *se)
if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
return;
- for_each_sched_entity(se)
+ for_each_sched_entity(se) {
+ if (SCHED_WARN_ON(!se->on_rq))
+ return;
cfs_rq_of(se)->last = se;
+ }
}
static void set_next_buddy(struct sched_entity *se)
@@ -6177,8 +6176,11 @@ static void set_next_buddy(struct sched_entity *se)
if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
return;
- for_each_sched_entity(se)
+ for_each_sched_entity(se) {
+ if (SCHED_WARN_ON(!se->on_rq))
+ return;
cfs_rq_of(se)->next = se;
+ }
}
static void set_skip_buddy(struct sched_entity *se)
@@ -6970,10 +6972,28 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (cfs_rq->avg.load_sum)
+ return false;
+
+ if (cfs_rq->avg.util_sum)
+ return false;
+
+ if (cfs_rq->runnable_load_sum)
+ return false;
+
+ return true;
+}
+
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
@@ -6983,7 +7003,7 @@ static void update_blocked_averages(int cpu)
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
- for_each_leaf_cfs_rq(rq, cfs_rq) {
+ for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
/* throttled entities do not contribute to load */
@@ -6997,6 +7017,13 @@ static void update_blocked_averages(int cpu)
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
update_load_avg(se, 0);
+
+ /*
+ * There can be a lot of idle CPU cgroups. Don't let fully
+ * decayed cfs_rqs linger on the list.
+ */
+ if (cfs_rq_is_decayed(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
}
rq_unlock_irqrestore(rq, &rf);
}
@@ -7229,7 +7256,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
* span the current group.
*/
- for_each_cpu(cpu, sched_group_cpus(sdg)) {
+ for_each_cpu(cpu, sched_group_span(sdg)) {
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
@@ -7408,7 +7435,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
memset(sgs, 0, sizeof(*sgs));
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
/* Bias balancing toward cpus of our domain */
@@ -7572,7 +7599,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
- local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
if (local_group) {
sds->local = sg;
sgs = local;
@@ -7927,7 +7954,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
@@ -8033,7 +8060,6 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- struct cpumask *sg_cpus, *sg_mask;
int cpu, balance_cpu = -1;
/*
@@ -8043,11 +8069,9 @@ static int should_we_balance(struct lb_env *env)
if (env->idle == CPU_NEWLY_IDLE)
return 1;
- sg_cpus = sched_group_cpus(sg);
- sg_mask = sched_group_mask(sg);
/* Try to find first idle cpu */
- for_each_cpu_and(cpu, sg_cpus, env->cpus) {
- if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ if (!idle_cpu(cpu))
continue;
balance_cpu = cpu;
@@ -8083,7 +8107,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_cpus(sd->groups),
+ .dst_grpmask = sched_group_span(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
@@ -9523,10 +9547,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
rcu_read_lock();
- for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 11192e0cb122..d3fb15555291 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
SCHED_FEAT(SIS_AVG_CPU, false)
+SCHED_FEAT(SIS_PROP, true)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
@@ -76,7 +77,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
-SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ef63adce0c9c..6c23e30c0e5c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -219,6 +219,7 @@ static void do_idle(void)
*/
__current_set_polling();
+ quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 979b7341008a..581d5c7a5264 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ int skip;
+
+ /*
+ * When span == cpu_online_mask, taking each rq->lock
+ * can be time-consuming. Try to avoid it when possible.
+ */
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (skip)
+ continue;
raw_spin_lock(&rq->lock);
if (rt_rq->rt_time) {
@@ -1819,7 +1830,7 @@ retry:
* pushing.
*/
task = pick_next_pushable_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task hasn't migrated, and is still the next
* eligible task, but we failed to find a run-queue
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6dda2aab731e..e0329d10bdb8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -39,9 +39,9 @@
#include "cpuacct.h"
#ifdef CONFIG_SCHED_DEBUG
-#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
-#define SCHED_WARN_ON(x) ((void)(x))
+# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
#endif
struct rq;
@@ -219,22 +219,27 @@ static inline int dl_bandwidth_enabled(void)
}
extern struct dl_bw *dl_bw_of(int i);
+extern int dl_bw_cpus(int i);
struct dl_bw {
raw_spinlock_t lock;
u64 bw, total_bw;
};
+static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
+
static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
dl_b->total_bw -= tsk_bw;
+ __dl_update(dl_b, (s32)tsk_bw / cpus);
}
static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
dl_b->total_bw += tsk_bw;
+ __dl_update(dl_b, -((s32)tsk_bw / cpus));
}
static inline
@@ -244,6 +249,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
+void dl_change_utilization(struct task_struct *p, u64 new_bw);
extern void init_dl_bw(struct dl_bw *dl_b);
#ifdef CONFIG_CGROUP_SCHED
@@ -558,6 +564,30 @@ struct dl_rq {
#else
struct dl_bw dl_bw;
#endif
+ /*
+ * "Active utilization" for this runqueue: increased when a
+ * task wakes up (becomes TASK_RUNNING) and decreased when a
+ * task blocks
+ */
+ u64 running_bw;
+
+ /*
+ * Utilization of the tasks "assigned" to this runqueue (including
+ * the tasks that are in runqueue and the tasks that executed on this
+ * CPU and blocked). Increased when a task moves to this runqueue, and
+ * decreased when the task moves away (migrates, changes scheduling
+ * policy, or terminates).
+ * This is needed to compute the "inactive utilization" for the
+ * runqueue (inactive utilization = this_bw - running_bw).
+ */
+ u64 this_bw;
+ u64 extra_bw;
+
+ /*
+ * Inverse of the fraction of CPU utilization that can be reclaimed
+ * by the GRUB algorithm.
+ */
+ u64 bw_ratio;
};
#ifdef CONFIG_SMP
@@ -606,11 +636,9 @@ struct root_domain {
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
-extern cpumask_var_t fallback_doms;
-extern cpumask_var_t sched_domains_tmpmask;
extern void init_defrootdomain(void);
-extern int init_sched_domains(const struct cpumask *cpu_map);
+extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
#endif /* CONFIG_SMP */
@@ -1025,7 +1053,11 @@ struct sched_group_capacity {
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
- unsigned long cpumask[0]; /* iteration mask */
+#ifdef CONFIG_SCHED_DEBUG
+ int id;
+#endif
+
+ unsigned long cpumask[0]; /* balance mask */
};
struct sched_group {
@@ -1046,16 +1078,15 @@ struct sched_group {
unsigned long cpumask[0];
};
-static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+static inline struct cpumask *sched_group_span(struct sched_group *sg)
{
return to_cpumask(sg->cpumask);
}
/*
- * cpumask masking which cpus in the group are allowed to iterate up the domain
- * tree.
+ * See build_balance_mask().
*/
-static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+static inline struct cpumask *group_balance_mask(struct sched_group *sg)
{
return to_cpumask(sg->sgc->cpumask);
}
@@ -1066,7 +1097,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg)
*/
static inline unsigned int group_first_cpu(struct sched_group *group)
{
- return cpumask_first(sched_group_cpus(group));
+ return cpumask_first(sched_group_span(group));
}
extern int group_balance_cpu(struct sched_group *sg);
@@ -1422,7 +1453,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
curr->sched_class->set_curr_task(rq);
}
+#ifdef CONFIG_SMP
#define sched_class_highest (&stop_sched_class)
+#else
+#define sched_class_highest (&dl_sched_class)
+#endif
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
@@ -1486,7 +1521,12 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
+#define BW_SHIFT 20
+#define BW_UNIT (1 << BW_SHIFT)
+#define RATIO_SHIFT 8
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
@@ -1928,6 +1968,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu);
static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#endif
+
+#ifdef CONFIG_SMP
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+ int i;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->dl.extra_bw += bw;
+ }
+}
+#else
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+ dl->extra_bw += bw;
+}
+#endif
+
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
u64 total;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1b0b4fb12837..79895aec281e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
cpumask_var_t sched_domains_tmpmask;
+cpumask_var_t sched_domains_tmpmask2;
#ifdef CONFIG_SCHED_DEBUG
@@ -35,7 +36,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_clear(groupmask);
- printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+ printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
@@ -45,14 +46,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %*pbl level %s\n",
+ printk(KERN_CONT "span=%*pbl level=%s\n",
cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
"CPU%d\n", cpu);
}
- if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+ if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
printk(KERN_ERR "ERROR: domain->groups does not contain"
" CPU%d\n", cpu);
}
@@ -65,29 +66,47 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!cpumask_weight(sched_group_cpus(group))) {
+ if (!cpumask_weight(sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
break;
}
if (!(sd->flags & SD_OVERLAP) &&
- cpumask_intersects(groupmask, sched_group_cpus(group))) {
+ cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
break;
}
- cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+ cpumask_or(groupmask, groupmask, sched_group_span(group));
- printk(KERN_CONT " %*pbl",
- cpumask_pr_args(sched_group_cpus(group)));
- if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %lu)",
- group->sgc->capacity);
+ printk(KERN_CONT " %d:{ span=%*pbl",
+ group->sgc->id,
+ cpumask_pr_args(sched_group_span(group)));
+
+ if ((sd->flags & SD_OVERLAP) &&
+ !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
+ printk(KERN_CONT " mask=%*pbl",
+ cpumask_pr_args(group_balance_mask(group)));
+ }
+
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
+ printk(KERN_CONT " cap=%lu", group->sgc->capacity);
+
+ if (group == sd->groups && sd->child &&
+ !cpumask_equal(sched_domain_span(sd->child),
+ sched_group_span(group))) {
+ printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
}
+ printk(KERN_CONT " }");
+
group = group->next;
+
+ if (group != sd->groups)
+ printk(KERN_CONT ",");
+
} while (group != sd->groups);
printk(KERN_CONT "\n");
@@ -113,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
return;
}
- printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+ printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
for (;;) {
if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
@@ -477,46 +496,214 @@ enum s_alloc {
};
/*
- * Build an iteration mask that can exclude certain CPUs from the upwards
- * domain traversal.
+ * Return the canonical balance CPU for this group, this is the first CPU
+ * of this group that's also in the balance mask.
*
- * Asymmetric node setups can result in situations where the domain tree is of
- * unequal depth, make sure to skip domains that already cover the entire
- * range.
+ * The balance mask are all those CPUs that could actually end up at this
+ * group. See build_balance_mask().
*
- * In that case build_sched_domains() will have terminated the iteration early
- * and our sibling sd spans will be empty. Domains should always include the
- * CPU they're built on, so check that.
+ * Also see should_we_balance().
*/
-static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+int group_balance_cpu(struct sched_group *sg)
{
- const struct cpumask *span = sched_domain_span(sd);
+ return cpumask_first(group_balance_mask(sg));
+}
+
+
+/*
+ * NUMA topology (first read the regular topology blurb below)
+ *
+ * Given a node-distance table, for example:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 30 20
+ * 1: 20 10 20 30
+ * 2: 30 20 10 20
+ * 3: 20 30 20 10
+ *
+ * which represents a 4 node ring topology like:
+ *
+ * 0 ----- 1
+ * | |
+ * | |
+ * | |
+ * 3 ----- 2
+ *
+ * We want to construct domains and groups to represent this. The way we go
+ * about doing this is to build the domains on 'hops'. For each NUMA level we
+ * construct the mask of all nodes reachable in @level hops.
+ *
+ * For the above NUMA topology that gives 3 levels:
+ *
+ * NUMA-2 0-3 0-3 0-3 0-3
+ * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
+ *
+ * NUMA-1 0-1,3 0-2 1-3 0,2-3
+ * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
+ *
+ * NUMA-0 0 1 2 3
+ *
+ *
+ * As can be seen; things don't nicely line up as with the regular topology.
+ * When we iterate a domain in child domain chunks some nodes can be
+ * represented multiple times -- hence the "overlap" naming for this part of
+ * the topology.
+ *
+ * In order to minimize this overlap, we only build enough groups to cover the
+ * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
+ *
+ * Because:
+ *
+ * - the first group of each domain is its child domain; this
+ * gets us the first 0-1,3
+ * - the only uncovered node is 2, who's child domain is 1-3.
+ *
+ * However, because of the overlap, computing a unique CPU for each group is
+ * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
+ * groups include the CPUs of Node-0, while those CPUs would not in fact ever
+ * end up at those groups (they would end up in group: 0-1,3).
+ *
+ * To correct this we have to introduce the group balance mask. This mask
+ * will contain those CPUs in the group that can reach this group given the
+ * (child) domain tree.
+ *
+ * With this we can once again compute balance_cpu and sched_group_capacity
+ * relations.
+ *
+ * XXX include words on how balance_cpu is unique and therefore can be
+ * used for sched_group_capacity links.
+ *
+ *
+ * Another 'interesting' topology is:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 20 30
+ * 1: 20 10 20 20
+ * 2: 20 20 10 20
+ * 3: 30 20 20 10
+ *
+ * Which looks a little like:
+ *
+ * 0 ----- 1
+ * | / |
+ * | / |
+ * | / |
+ * 2 ----- 3
+ *
+ * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
+ * are not.
+ *
+ * This leads to a few particularly weird cases where the sched_domain's are
+ * not of the same number for each cpu. Consider:
+ *
+ * NUMA-2 0-3 0-3
+ * groups: {0-2},{1-3} {1-3},{0-2}
+ *
+ * NUMA-1 0-2 0-3 0-3 1-3
+ *
+ * NUMA-0 0 1 2 3
+ *
+ */
+
+
+/*
+ * Build the balance mask; it contains only those CPUs that can arrive at this
+ * group and should be considered to continue balancing.
+ *
+ * We do this during the group creation pass, therefore the group information
+ * isn't complete yet, however since each group represents a (child) domain we
+ * can fully construct this using the sched_domain bits (which are already
+ * complete).
+ */
+static void
+build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
+{
+ const struct cpumask *sg_span = sched_group_span(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
- for_each_cpu(i, span) {
+ cpumask_clear(mask);
+
+ for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+
+ /*
+ * Can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
continue;
- cpumask_set_cpu(i, sched_group_mask(sg));
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
+ continue;
+
+ cpumask_set_cpu(i, mask);
}
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(mask));
}
/*
- * Return the canonical balance CPU for this group, this is the first CPU
- * of this group that's also in the iteration mask.
+ * XXX: This creates per-node group entries; since the load-balancer will
+ * immediately access remote memory to construct this group's load-balance
+ * statistics having the groups node local is of dubious benefit.
*/
-int group_balance_cpu(struct sched_group *sg)
+static struct sched_group *
+build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{
- return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+ struct sched_group *sg;
+ struct cpumask *sg_span;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(cpu));
+
+ if (!sg)
+ return NULL;
+
+ sg_span = sched_group_span(sg);
+ if (sd->child)
+ cpumask_copy(sg_span, sched_domain_span(sd->child));
+ else
+ cpumask_copy(sg_span, sched_domain_span(sd));
+
+ return sg;
+}
+
+static void init_overlap_sched_group(struct sched_domain *sd,
+ struct sched_group *sg)
+{
+ struct cpumask *mask = sched_domains_tmpmask2;
+ struct sd_data *sdd = sd->private;
+ struct cpumask *sg_span;
+ int cpu;
+
+ build_balance_mask(sd, sg, mask);
+ cpu = cpumask_first_and(sched_group_span(sg), mask);
+
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ cpumask_copy(group_balance_mask(sg), mask);
+ else
+ WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
+
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg_span = sched_group_span(sg);
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
}
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
- struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ struct sched_group *first = NULL, *last = NULL, *sg;
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered = sched_domains_tmpmask;
struct sd_data *sdd = sd->private;
@@ -525,7 +712,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_clear(covered);
- for_each_cpu(i, span) {
+ for_each_cpu_wrap(i, span, cpu) {
struct cpumask *sg_span;
if (cpumask_test_cpu(i, covered))
@@ -533,44 +720,27 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
sibling = *per_cpu_ptr(sdd->sd, i);
- /* See the comment near build_group_mask(). */
+ /*
+ * Asymmetric node setups can result in situations where the
+ * domain tree is of unequal depth, make sure to skip domains
+ * that already cover the entire range.
+ *
+ * In that case build_sched_domains() will have terminated the
+ * iteration early and our sibling sd spans will be empty.
+ * Domains should always include the CPU they're built on, so
+ * check that.
+ */
if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(cpu));
-
+ sg = build_group_from_child_sched_domain(sibling, cpu);
if (!sg)
goto fail;
- sg_span = sched_group_cpus(sg);
- if (sibling->child)
- cpumask_copy(sg_span, sched_domain_span(sibling->child));
- else
- cpumask_set_cpu(i, sg_span);
-
+ sg_span = sched_group_span(sg);
cpumask_or(covered, covered, sg_span);
- sg->sgc = *per_cpu_ptr(sdd->sgc, i);
- if (atomic_inc_return(&sg->sgc->ref) == 1)
- build_group_mask(sd, sg);
-
- /*
- * Initialize sgc->capacity such that even if we mess up the
- * domains and no possible iteration will get us here, we won't
- * die on a /0 trap.
- */
- sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
-
- /*
- * Make sure the first group of this domain contains the
- * canonical balance CPU. Otherwise the sched_domain iteration
- * breaks. See update_sg_lb_stats().
- */
- if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- group_balance_cpu(sg) == cpu)
- groups = sg;
+ init_overlap_sched_group(sd, sg);
if (!first)
first = sg;
@@ -579,7 +749,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
last = sg;
last->next = first;
}
- sd->groups = groups;
+ sd->groups = first;
return 0;
@@ -589,23 +759,106 @@ fail:
return -ENOMEM;
}
-static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+
+/*
+ * Package topology (also see the load-balance blurb in fair.c)
+ *
+ * The scheduler builds a tree structure to represent a number of important
+ * topology features. By default (default_topology[]) these include:
+ *
+ * - Simultaneous multithreading (SMT)
+ * - Multi-Core Cache (MC)
+ * - Package (DIE)
+ *
+ * Where the last one more or less denotes everything up to a NUMA node.
+ *
+ * The tree consists of 3 primary data structures:
+ *
+ * sched_domain -> sched_group -> sched_group_capacity
+ * ^ ^ ^ ^
+ * `-' `-'
+ *
+ * The sched_domains are per-cpu and have a two way link (parent & child) and
+ * denote the ever growing mask of CPUs belonging to that level of topology.
+ *
+ * Each sched_domain has a circular (double) linked list of sched_group's, each
+ * denoting the domains of the level below (or individual CPUs in case of the
+ * first domain level). The sched_group linked by a sched_domain includes the
+ * CPU of that sched_domain [*].
+ *
+ * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * DIE [ ]
+ * MC [ ] [ ]
+ * SMT [ ] [ ] [ ] [ ]
+ *
+ * - or -
+ *
+ * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
+ * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * One way to think about it is: sched_domain moves you up and down among these
+ * topology levels, while sched_group moves you sideways through it, at child
+ * domain granularity.
+ *
+ * sched_group_capacity ensures each unique sched_group has shared storage.
+ *
+ * There are two related construction problems, both require a CPU that
+ * uniquely identify each group (for a given domain):
+ *
+ * - The first is the balance_cpu (see should_we_balance() and the
+ * load-balance blub in fair.c); for each group we only want 1 CPU to
+ * continue balancing at a higher domain.
+ *
+ * - The second is the sched_group_capacity; we want all identical groups
+ * to share a single sched_group_capacity.
+ *
+ * Since these topologies are exclusive by construction. That is, its
+ * impossible for an SMT thread to belong to multiple cores, and cores to
+ * be part of multiple caches. There is a very clear and unique location
+ * for each CPU in the hierarchy.
+ *
+ * Therefore computing a unique CPU for each group is trivial (the iteration
+ * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
+ * group), we can simply pick the first CPU in each group.
+ *
+ *
+ * [*] in other words, the first group of each domain is its child domain.
+ */
+
+static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
struct sched_domain *child = sd->child;
+ struct sched_group *sg;
if (child)
cpu = cpumask_first(sched_domain_span(child));
- if (sg) {
- *sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ sg = *per_cpu_ptr(sdd->sg, cpu);
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+
+ /* For claim_allocations: */
+ atomic_inc(&sg->ref);
+ atomic_inc(&sg->sgc->ref);
- /* For claim_allocations: */
- atomic_set(&(*sg)->sgc->ref, 1);
+ if (child) {
+ cpumask_copy(sched_group_span(sg), sched_domain_span(child));
+ cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ } else {
+ cpumask_set_cpu(cpu, sched_group_span(sg));
+ cpumask_set_cpu(cpu, group_balance_mask(sg));
}
- return cpu;
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+
+ return sg;
}
/*
@@ -624,34 +877,20 @@ build_sched_groups(struct sched_domain *sd, int cpu)
struct cpumask *covered;
int i;
- get_group(cpu, sdd, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (cpu != cpumask_first(span))
- return 0;
-
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
cpumask_clear(covered);
- for_each_cpu(i, span) {
+ for_each_cpu_wrap(i, span, cpu) {
struct sched_group *sg;
- int group, j;
if (cpumask_test_cpu(i, covered))
continue;
- group = get_group(i, sdd, &sg);
- cpumask_setall(sched_group_mask(sg));
+ sg = get_group(i, sdd);
- for_each_cpu(j, span) {
- if (get_group(j, sdd, NULL) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
+ cpumask_or(covered, covered, sched_group_span(sg));
if (!first)
first = sg;
@@ -660,6 +899,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
last = sg;
}
last->next = first;
+ sd->groups = first;
return 0;
}
@@ -683,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
do {
int cpu, max_cpu = -1;
- sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg->group_weight = cpumask_weight(sched_group_span(sg));
if (!(sd->flags & SD_ASYM_PACKING))
goto next;
- for_each_cpu(cpu, sched_group_cpus(sg)) {
+ for_each_cpu(cpu, sched_group_span(sg)) {
if (max_cpu < 0)
max_cpu = cpu;
else if (sched_asym_prefer(cpu, max_cpu))
@@ -1308,6 +1548,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sgc)
return -ENOMEM;
+#ifdef CONFIG_SCHED_DEBUG
+ sgc->id = j;
+#endif
+
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -1407,7 +1651,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ if (tl->flags & SDTL_OVERLAP)
sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
@@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur;
* cpumask) fails, then fallback to a single sched domain,
* as determined by the single cpumask fallback_doms.
*/
-cpumask_var_t fallback_doms;
+static cpumask_var_t fallback_doms;
/*
* arch_update_cpu_topology lets virtualized architectures update the
@@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* For now this just excludes isolated CPUs, but could be used to
* exclude other special cases in the future.
*/
-int init_sched_domains(const struct cpumask *cpu_map)
+int sched_init_domains(const struct cpumask *cpu_map)
{
int err;
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
+ zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+ zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
arch_update_cpu_topology();
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
diff --git a/kernel/smp.c b/kernel/smp.c
index a817769b53c0..3061483cb3ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -30,6 +30,7 @@ enum {
struct call_function_data {
struct call_single_data __percpu *csd;
cpumask_var_t cpumask;
+ cpumask_var_t cpumask_ipi;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu)
if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
return -ENOMEM;
+ if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+ cpu_to_node(cpu))) {
+ free_cpumask_var(cfd->cpumask);
+ return -ENOMEM;
+ }
cfd->csd = alloc_percpu(struct call_single_data);
if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
+ free_cpumask_var(cfd->cpumask_ipi);
return -ENOMEM;
}
@@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
free_cpumask_var(cfd->cpumask);
+ free_cpumask_var(cfd->cpumask_ipi);
free_percpu(cfd->csd);
return 0;
}
@@ -428,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask,
cfd = this_cpu_ptr(&cfd_data);
cpumask_and(cfd->cpumask, mask, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, cfd->cpumask);
+ __cpumask_clear_cpu(this_cpu, cfd->cpumask);
/* Some callers race with other cpus changing the passed mask */
if (unlikely(!cpumask_weight(cfd->cpumask)))
return;
+ cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
@@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask,
csd->flags |= CSD_FLAG_SYNCHRONOUS;
csd->func = func;
csd->info = info;
- llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+ if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+ __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
}
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(cfd->cpumask);
+ arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
if (wait) {
for_each_cpu(cpu, cfd->cpumask) {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 93621ae718d3..03918a19cf2d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
+ if (cs == curr_clocksource && cs->tick_stable)
+ cs->tick_stable(cs);
+
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 64c97fc130c4..9c2dc64e31d8 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -554,7 +554,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
update_ts_time_stats(smp_processor_id(), ts, now, NULL);
ts->idle_active = 0;
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
}
static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 81dedaab36cc..4731a0895760 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
}
EXPORT_SYMBOL(cpumask_any_but);
+/**
+ * cpumask_next_wrap - helper to implement for_each_cpu_wrap
+ * @n: the cpu prior to the place to search
+ * @mask: the cpumask pointer
+ * @start: the start point of the iteration
+ * @wrap: assume @n crossing @start terminates the iteration
+ *
+ * Returns >= nr_cpu_ids on completion
+ *
+ * Note: the @wrap argument is required for the start condition when
+ * we cannot assume @start is set in @mask.
+ */
+int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+{
+ int next;
+
+again:
+ next = cpumask_next(n, mask);
+
+ if (wrap && n < start && next >= start) {
+ return nr_cpumask_bits;
+
+ } else if (next >= nr_cpumask_bits) {
+ wrap = true;
+ n = -1;
+ goto again;
+ }
+
+ return next;
+}
+EXPORT_SYMBOL(cpumask_next_wrap);
+
/* These are not inline because of header tangles. */
#ifdef CONFIG_CPUMASK_OFFSTACK
/**
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 690d75b132fa..2fb007be0212 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -28,7 +28,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
/*
* It is valid to assume CPU-locality during early bootup:
*/
- if (system_state != SYSTEM_RUNNING)
+ if (system_state < SYSTEM_SCHEDULING)
goto out;
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ad39bbc79e6..c3c1c6ac62da 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3652,7 +3652,7 @@ int kswapd_run(int nid)
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
- BUG_ON(system_state == SYSTEM_BOOTING);
+ BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;