From 2e672ab2d491713541963afca3a5967ccc2376e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Oct 2017 16:54:49 -0700 Subject: rcu: Avoid ->dynticks_nmi_nesting store tearing NMIs can nest, and store tearing could in theory happen on carries from one byte to the next. This commit therefore adds the WRITE_ONCE() macros preventing this. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9c0ca2ccf0c..c5d960f86cf8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1103,7 +1103,8 @@ void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); incby = 1; } - rdtp->dynticks_nmi_nesting += incby; + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ + rdtp->dynticks_nmi_nesting + incby); barrier(); } @@ -1135,12 +1136,13 @@ void rcu_nmi_exit(void) * leave it in non-RCU-idle state. */ if (rdtp->dynticks_nmi_nesting != 1) { - rdtp->dynticks_nmi_nesting -= 2; + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ + rdtp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - rdtp->dynticks_nmi_nesting = 0; + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ rcu_dynticks_eqs_enter(); } -- cgit v1.2.3 From a0eb22bf64a755bb162b421120b9fbe7d012b85f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Oct 2017 19:45:10 -0700 Subject: rcu: Reduce dyntick-idle state space Both extended-quiescent-state entry and exit first update the nesting counter and then adjust the dyntick-idle state. This means that there are four states: (1) Both nesting and dyntick idle indicate idle, (2) Nesting indicates idle but dyntick idle does not, (3) Nesting indicates non-idle and dyntick idle does not, and (4) Both nesting and dyntick idle indicate non-idle. This commit simplifies the state space by eliminating #3, reversing the order of updates on exit from extended quiescent state. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c5d960f86cf8..49f661bb8ffe 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -928,21 +928,21 @@ void rcu_irq_exit_irqson(void) * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_eqs_exit_common(long long oldval, int user) +static void rcu_eqs_exit_common(long long newval, int user) { RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, newval); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on exit: not idle task"), - oldval, rdtp->dynticks_nesting); + rdtp->dynticks_nesting, newval); rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -967,8 +967,8 @@ static void rcu_eqs_exit(bool user) rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { __this_cpu_inc(disable_rcu_irq_enter); + rcu_eqs_exit_common(DYNTICK_TASK_EXIT_IDLE, user); rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(oldval, user); __this_cpu_dec(disable_rcu_irq_enter); } } @@ -1037,7 +1037,7 @@ void rcu_user_exit(void) void rcu_irq_enter(void) { struct rcu_dynticks *rdtp; - long long oldval; + long long newval; lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); @@ -1046,14 +1046,13 @@ void rcu_irq_enter(void) if (rdtp->dynticks_nmi_nesting) return; - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting++; - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting == 0); - if (oldval) - trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); + newval = rdtp->dynticks_nesting + 1; + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && newval == 0); + if (rdtp->dynticks_nesting) + trace_rcu_dyntick(TPS("++="), rdtp->dynticks_nesting, newval); else - rcu_eqs_exit_common(oldval, true); + rcu_eqs_exit_common(newval, true); + rdtp->dynticks_nesting++; } /* -- cgit v1.2.3 From fd581a91ac16187625ec509414d08d37827472c4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Oct 2017 21:56:20 -0700 Subject: rcu: Move rcu_nmi_{enter,exit}() to prepare for consolidation This is a code-motion-only commit that prepares to define rcu_irq_enter() in terms of rcu_nmi_enter() and rcu_irq_exit() in terms of rcu_irq_exit(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 150 +++++++++++++++++++++++++++--------------------------- 1 file changed, 75 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 49f661bb8ffe..419f3c38e1b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -866,6 +866,44 @@ void rcu_user_enter(void) } #endif /* CONFIG_NO_HZ_FULL */ +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * + * If we are returning from the outermost NMI handler that interrupted an + * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting + * to let the RCU grace-period handling know that the CPU is back to + * being RCU-idle. + * + * If you add or remove a call to rcu_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void rcu_nmi_exit(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + /* + * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * (We are exiting an NMI handler, so RCU better be paying attention + * to us!) + */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + + /* + * If the nesting level is not 1, the CPU wasn't RCU-idle, so + * leave it in non-RCU-idle state. + */ + if (rdtp->dynticks_nmi_nesting != 1) { + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ + rdtp->dynticks_nmi_nesting - 2); + return; + } + + /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + rcu_dynticks_eqs_enter(); +} + /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * @@ -1012,6 +1050,43 @@ void rcu_user_exit(void) } #endif /* CONFIG_NO_HZ_FULL */ +/** + * rcu_nmi_enter - inform RCU of entry to NMI context + * + * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and + * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know + * that the CPU is active. This implementation permits nested NMIs, as + * long as the nesting level does not overflow an int. (You will probably + * run out of stack space first.) + * + * If you add or remove a call to rcu_nmi_enter(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void rcu_nmi_enter(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int incby = 2; + + /* Complain about underflow. */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); + + /* + * If idle from RCU viewpoint, atomically increment ->dynticks + * to mark non-idle and increment ->dynticks_nmi_nesting by one. + * Otherwise, increment ->dynticks_nmi_nesting by two. This means + * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * to be in the outermost NMI handler that interrupted an RCU-idle + * period (observation due to Andy Lutomirski). + */ + if (rcu_dynticks_curr_cpu_in_eqs()) { + rcu_dynticks_eqs_exit(); + incby = 1; + } + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ + rdtp->dynticks_nmi_nesting + incby); + barrier(); +} + /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * @@ -1070,81 +1145,6 @@ void rcu_irq_enter_irqson(void) local_irq_restore(flags); } -/** - * rcu_nmi_enter - inform RCU of entry to NMI context - * - * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and - * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know - * that the CPU is active. This implementation permits nested NMIs, as - * long as the nesting level does not overflow an int. (You will probably - * run out of stack space first.) - * - * If you add or remove a call to rcu_nmi_enter(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_nmi_enter(void) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int incby = 2; - - /* Complain about underflow. */ - WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); - - /* - * If idle from RCU viewpoint, atomically increment ->dynticks - * to mark non-idle and increment ->dynticks_nmi_nesting by one. - * Otherwise, increment ->dynticks_nmi_nesting by two. This means - * if ->dynticks_nmi_nesting is equal to one, we are guaranteed - * to be in the outermost NMI handler that interrupted an RCU-idle - * period (observation due to Andy Lutomirski). - */ - if (rcu_dynticks_curr_cpu_in_eqs()) { - rcu_dynticks_eqs_exit(); - incby = 1; - } - WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ - rdtp->dynticks_nmi_nesting + incby); - barrier(); -} - -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting - * to let the RCU grace-period handling know that the CPU is back to - * being RCU-idle. - * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_nmi_exit(void) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - /* - * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. - * (We are exiting an NMI handler, so RCU better be paying attention - * to us!) - */ - WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); - - /* - * If the nesting level is not 1, the CPU wasn't RCU-idle, so - * leave it in non-RCU-idle state. - */ - if (rdtp->dynticks_nmi_nesting != 1) { - WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ - rdtp->dynticks_nmi_nesting - 2); - return; - } - - /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - rcu_dynticks_eqs_enter(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is idle * -- cgit v1.2.3 From 6136d6e48a0138f6be5bb3427dbeb0ba07a546a4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Oct 2017 08:28:04 -0700 Subject: rcu: Clamp ->dynticks_nmi_nesting at eqs entry/exit In preparation for merging dyntick-idle irq handling into the NMI algorithm, clamp ->dynticks_nmi_nesting value to allow for interrupts that enter but never leave and vice versa. It is important that the clamping happen outside of the extended quiescent state. Otherwise, there will be short windows where irqs and NMIs fail to convince RCU to start watching. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 ++ kernel/rcu/tree.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 59c471de342a..f4a411964c41 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -56,6 +56,8 @@ #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ DYNTICK_TASK_FLAG) +#define DYNTICK_IRQ_NONIDLE ((INT_MAX / 2) + 1) + /* * Grace-period counter management. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 419f3c38e1b6..142cdd4a50c9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -818,6 +818,7 @@ static void rcu_eqs_enter(bool user) struct rcu_dynticks *rdtp; rdtp = this_cpu_ptr(&rcu_dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) @@ -1008,6 +1009,7 @@ static void rcu_eqs_exit(bool user) rcu_eqs_exit_common(DYNTICK_TASK_EXIT_IDLE, user); rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; __this_cpu_dec(disable_rcu_irq_enter); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } } -- cgit v1.2.3 From 58721f5da4bcd5187566f4159a4fc88f70bf74f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Oct 2017 10:42:22 -0700 Subject: rcu: Define rcu_irq_{enter,exit}() in terms of rcu_nmi_{enter,exit}() RCU currently uses two different mechanisms for tracking irqs and NMIs. This is unnecessary complexity: Given that NMIs can nest and given that RCU's tracking handles such nesting, the NMI tracking mechanism can also be used to track irqs. This commit therefore defines rcu_irq_enter() in terms of rcu_nmi_enter() and rcu_irq_exit() in terms of rcu_nmi_exit(). Unfortunately, callers must still distinguish between the irq and NMI functions because additional actions are taken when an irq interrupts idle or nohz_full usermode execution, and these actions cannot always be taken from NMI handlers. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 59 ++++++++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 142cdd4a50c9..fde0e840563f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -266,6 +266,7 @@ void rcu_bh_qs(void) static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, + .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; @@ -914,8 +915,8 @@ void rcu_nmi_exit(void) * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your - * architecture violates this assumption, RCU will give you what you - * deserve, good and hard. But very infrequently and irreproducibly. + * architecture's idle loop violates this assumption, RCU will give you what + * you deserve, good and hard. But very infrequently and irreproducibly. * * Use things like work queues to work around this limitation. * @@ -926,23 +927,14 @@ void rcu_nmi_exit(void) */ void rcu_irq_exit(void) { - struct rcu_dynticks *rdtp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); lockdep_assert_irqs_disabled(); - rdtp = this_cpu_ptr(&rcu_dynticks); - - /* Page faults can happen in NMI handlers, so check... */ - if (rdtp->dynticks_nmi_nesting) - return; - - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting < 1); - if (rdtp->dynticks_nesting <= 1) { - rcu_eqs_enter_common(true); - } else { - trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); - rdtp->dynticks_nesting--; - } + if (rdtp->dynticks_nmi_nesting == 1) + rcu_prepare_for_idle(); + rcu_nmi_exit(); + if (rdtp->dynticks_nmi_nesting == 0) + rcu_dynticks_task_enter(); } /* @@ -1097,12 +1089,12 @@ void rcu_nmi_enter(void) * sections can occur. The caller must have disabled interrupts. * * Note that the Linux kernel is fully capable of entering an interrupt - * handler that it never exits, for example when doing upcalls to - * user mode! This code assumes that the idle loop never does upcalls to - * user mode. If your architecture does do upcalls from the idle loop (or - * does anything else that results in unbalanced calls to the irq_enter() - * and irq_exit() functions), RCU will give you what you deserve, good - * and hard. But very infrequently and irreproducibly. + * handler that it never exits, for example when doing upcalls to user mode! + * This code assumes that the idle loop never does upcalls to user mode. + * If your architecture's idle loop does do upcalls to user mode (or does + * anything else that results in unbalanced calls to the irq_enter() and + * irq_exit() functions), RCU will give you what you deserve, good and hard. + * But very infrequently and irreproducibly. * * Use things like work queues to work around this limitation. * @@ -1113,23 +1105,14 @@ void rcu_nmi_enter(void) */ void rcu_irq_enter(void) { - struct rcu_dynticks *rdtp; - long long newval; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); lockdep_assert_irqs_disabled(); - rdtp = this_cpu_ptr(&rcu_dynticks); - - /* Page faults can happen in NMI handlers, so check... */ - if (rdtp->dynticks_nmi_nesting) - return; - - newval = rdtp->dynticks_nesting + 1; - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && newval == 0); - if (rdtp->dynticks_nesting) - trace_rcu_dyntick(TPS("++="), rdtp->dynticks_nesting, newval); - else - rcu_eqs_exit_common(newval, true); - rdtp->dynticks_nesting++; + if (rdtp->dynticks_nmi_nesting == 0) + rcu_dynticks_task_exit(); + rcu_nmi_enter(); + if (rdtp->dynticks_nmi_nesting == 1) + rcu_cleanup_after_idle(); } /* -- cgit v1.2.3 From 51a1fd30f13090be7750fed86cf3728afaf4e394 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Oct 2017 14:43:40 -0700 Subject: rcu: Make ->dynticks_nesting be a simple counter Now that ->dynticks_nesting counts only process-level dyntick-idle entry and exit, there is no need for the elaborate segmented counter with its guard fields and overflow checking. This commit therefore makes ->dynticks_nesting be a simple counter. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 27 +-------------------------- kernel/rcu/tree.c | 40 ++++++++++++++++++++-------------------- kernel/rcu/tree.h | 1 - 3 files changed, 21 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f4a411964c41..afe0559d1867 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -30,32 +30,7 @@ #define RCU_TRACE(stmt) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -/* - * Process-level increment to ->dynticks_nesting field. This allows for - * architectures that use half-interrupts and half-exceptions from - * process context. - * - * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH - * that counts the number of process-based reasons why RCU cannot - * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE - * is the value used to increment or decrement this field. - * - * The rest of the bits could in principle be used to count interrupts, - * but this would mean that a negative-one value in the interrupt - * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. - * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK - * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. - * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon - * initial exit from idle. - */ -#define DYNTICK_TASK_NEST_WIDTH 7 -#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) -#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) -#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2) -#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3) -#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ - DYNTICK_TASK_FLAG) - +/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ #define DYNTICK_IRQ_NONIDLE ((INT_MAX / 2) + 1) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fde0e840563f..d123474fe829 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -265,7 +265,7 @@ void rcu_bh_qs(void) #endif static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, + .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; @@ -813,6 +813,10 @@ static void rcu_eqs_enter_common(bool user) /* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. + * + * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. */ static void rcu_eqs_enter(bool user) { @@ -821,11 +825,11 @@ static void rcu_eqs_enter(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); - if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + rdtp->dynticks_nesting == 0); + if (rdtp->dynticks_nesting == 1) rcu_eqs_enter_common(user); else - rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; + rdtp->dynticks_nesting--; } /** @@ -836,10 +840,6 @@ static void rcu_eqs_enter(bool user) * critical sections can occur in irq handlers in idle, a possibility * handled by irq_enter() and irq_exit().) * - * We crowbar the ->dynticks_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. - * * If you add or remove a call to rcu_idle_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ @@ -984,6 +984,10 @@ static void rcu_eqs_exit_common(long long newval, int user) /* * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. + * + * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to + * allow for the possibility of usermode upcalls messing up our count of + * interrupt nesting level during the busy period that is just now starting. */ static void rcu_eqs_exit(bool user) { @@ -994,12 +998,12 @@ static void rcu_eqs_exit(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + if (oldval) { + rdtp->dynticks_nesting++; } else { __this_cpu_inc(disable_rcu_irq_enter); - rcu_eqs_exit_common(DYNTICK_TASK_EXIT_IDLE, user); - rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_eqs_exit_common(1, user); + rdtp->dynticks_nesting = 1; __this_cpu_dec(disable_rcu_irq_enter); WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } @@ -1011,11 +1015,6 @@ static void rcu_eqs_exit(bool user) * Exit idle mode, in other words, -enter- the mode in which RCU * read-side critical sections can occur. * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to - * allow for the possibility of usermode upcalls messing up our count - * of interrupt nesting level during the busy period that is just - * now starting. - * * If you add or remove a call to rcu_idle_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ @@ -1219,7 +1218,8 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; + return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 && + __this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1; } /* @@ -3709,7 +3709,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); rdp->cpu = cpu; rdp->rsp = rsp; @@ -3738,7 +3738,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rdp->dynticks->dynticks_nesting = 1; rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 46a5d1991450..dbd7e3753bed 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -39,7 +39,6 @@ */ struct rcu_dynticks { long long dynticks_nesting; /* Track irq/process nesting level. */ - /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ -- cgit v1.2.3 From 844ccdd7dce2c1a6ea9b437fcf8c3265b136e4a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Oct 2017 16:51:47 -0700 Subject: rcu: Eliminate rcu_irq_enter_disabled() Now that the irq path uses the rcu_nmi_{enter,exit}() algorithm, rcu_irq_enter() and rcu_irq_exit() may be used from any context. There is thus no need for rcu_irq_enter_disabled() and for the checks using it. This commit therefore eliminates rcu_irq_enter_disabled(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 22 ++-------------------- kernel/trace/trace.c | 11 ----------- 2 files changed, 2 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d123474fe829..444aa2b3f24d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -270,20 +270,6 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; -/* - * There's a few places, currently just in the tracing infrastructure, - * that uses rcu_irq_enter() to make sure RCU is watching. But there's - * a small location where that will not even work. In those cases - * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter() - * can be called. - */ -static DEFINE_PER_CPU(bool, disable_rcu_irq_enter); - -bool rcu_irq_enter_disabled(void) -{ - return this_cpu_read(disable_rcu_irq_enter); -} - /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state. @@ -792,10 +778,8 @@ static void rcu_eqs_enter_common(bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - __this_cpu_inc(disable_rcu_irq_enter); - rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ - rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ - __this_cpu_dec(disable_rcu_irq_enter); + rdtp->dynticks_nesting = 0; + rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); /* @@ -1001,10 +985,8 @@ static void rcu_eqs_exit(bool user) if (oldval) { rdtp->dynticks_nesting++; } else { - __this_cpu_inc(disable_rcu_irq_enter); rcu_eqs_exit_common(1, user); rdtp->dynticks_nesting = 1; - __this_cpu_dec(disable_rcu_irq_enter); WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..dbce1be3bab8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2682,17 +2682,6 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, if (unlikely(in_nmi())) return; - /* - * It is possible that a function is being traced in a - * location that RCU is not watching. A call to - * rcu_irq_enter() will make sure that it is, but there's - * a few internal rcu functions that could be traced - * where that wont work either. In those cases, we just - * do nothing. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - rcu_irq_enter_irqson(); __ftrace_trace_stack(buffer, flags, skip, pc, NULL); rcu_irq_exit_irqson(); -- cgit v1.2.3 From aa24163b2ee5c92120e32e99b5a93143a0f4258e Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Wed, 15 Nov 2017 19:50:14 +0530 Subject: cgroup/cpuset: remove circular dependency deadlock Remove circular dependency deadlock in a scenario where hotplug of CPU is being done while there is updation in cgroup and cpuset triggered from userspace. Process A => kthreadd => Process B => Process C => Process A Process A cpu_subsys_offline(); cpu_down(); _cpu_down(); percpu_down_write(&cpu_hotplug_lock); //held cpuhp_invoke_callback(); workqueue_offline_cpu(); queue_work_on(); // unbind_work on system_highpri_wq __queue_work(); insert_work(); wake_up_worker(); flush_work(); wait_for_completion(); worker_thread(); manage_workers(); create_worker(); kthread_create_on_node(); wake_up_process(kthreadd_task); kthreadd kthreadd(); kernel_thread(); do_fork(); copy_process(); percpu_down_read(&cgroup_threadgroup_rwsem); __rwsem_down_read_failed_common(); //waiting Process B kernfs_fop_write(); cgroup_file_write(); cgroup_procs_write(); percpu_down_write(&cgroup_threadgroup_rwsem); //held cgroup_attach_task(); cgroup_migrate(); cgroup_migrate_execute(); cpuset_can_attach(); mutex_lock(&cpuset_mutex); //waiting Process C kernfs_fop_write(); cgroup_file_write(); cpuset_write_resmask(); mutex_lock(&cpuset_mutex); //held update_cpumask(); update_cpumasks_hier(); rebuild_sched_domains_locked(); get_online_cpus(); percpu_down_read(&cpu_hotplug_lock); //waiting Eliminating deadlock by reversing the locking order for cpuset_mutex and cpu_hotplug_lock. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 53 ++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f7efa7b4d825..cab5fd1ee767 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -812,6 +812,18 @@ done: return ndoms; } +static void cpuset_sched_change_begin(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +static void cpuset_sched_change_end(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + /* * Rebuild scheduler domains. * @@ -821,16 +833,14 @@ done: * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -838,27 +848,25 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { - mutex_lock(&cpuset_mutex); - rebuild_sched_domains_locked(); - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_begin(); + rebuild_sched_domains_cpuslocked(); + cpuset_sched_change_end(); } /** @@ -944,7 +952,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } /** @@ -1276,7 +1284,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } return 0; @@ -1309,7 +1317,6 @@ static void update_tasks_flags(struct cpuset *cs) * * Call with cpuset_mutex held. */ - static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { @@ -1342,7 +1349,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1610,7 +1617,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -1646,7 +1653,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return retval; } @@ -1657,7 +1664,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1670,7 +1677,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return retval; } @@ -1709,7 +1716,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1733,7 +1740,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2034,14 +2041,14 @@ out_unlock: /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_cpuslocked(). */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -2049,7 +2056,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); } static void cpuset_css_free(struct cgroup_subsys_state *css) -- cgit v1.2.3 From 1599a185f0e6113be185b9fb809c621c73865829 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Wed, 15 Nov 2017 19:50:15 +0530 Subject: cpuset: Make cpuset hotplug synchronous Convert cpuset_hotplug_workfn() into synchronous call for cpu hotplug path. For memory hotplug path it still gets queued as a work item. Since cpuset_hotplug_workfn() can be made synchronous for cpu hotplug path, it is not required to wait for cpuset hotplug while thawing processes. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 41 ++++++++++++++++++++--------------------- kernel/power/process.c | 2 -- kernel/sched/core.c | 1 - 3 files changed, 20 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index cab5fd1ee767..227bc25d951d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2277,15 +2277,8 @@ retry: mutex_unlock(&cpuset_mutex); } -static bool force_rebuild; - -void cpuset_force_rebuild(void) -{ - force_rebuild = true; -} - /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2300,7 +2293,7 @@ void cpuset_force_rebuild(void) * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug(bool use_cpu_hp_lock) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -2358,25 +2351,31 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; - rebuild_sched_domains(); + if (cpus_updated) { + if (use_cpu_hp_lock) + rebuild_sched_domains(); + else { + /* Acquiring cpu_hotplug_lock is not required. + * When cpuset_hotplug() is called in hotplug path, + * cpu_hotplug_lock is held by the hotplug context + * which is waiting for cpuhp_thread_fun to indicate + * completion of callback. + */ + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_cpuslocked(); + mutex_unlock(&cpuset_mutex); + } } } -void cpuset_update_active_cpus(void) +static void cpuset_hotplug_workfn(struct work_struct *work) { - /* - * We're inside cpu hotplug critical region which usually nests - * inside cgroup synchronization. Bounce actual hotplug processing - * to a work item to avoid reverse locking order. - */ - schedule_work(&cpuset_hotplug_work); + cpuset_hotplug(true); } -void cpuset_wait_for_hotplug(void) +void cpuset_update_active_cpus(void) { - flush_work(&cpuset_hotplug_work); + cpuset_hotplug(false); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index 7381d49a44db..c326d7235c5f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -204,8 +204,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..88b3450b29ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5624,7 +5624,6 @@ static void cpuset_cpu_active(void) * restore the original sched domains by considering the * cpuset configurations. */ - cpuset_force_rebuild(); } cpuset_update_active_cpus(); } -- cgit v1.2.3 From c98a9805096460567404799a7bd3149826affde7 Mon Sep 17 00:00:00 2001 From: Tal Shorer Date: Fri, 3 Nov 2017 17:27:50 +0200 Subject: workqueue: respect isolated cpus when queueing an unbound work Initialize wq_unbound_cpumask to exclude cpus that were isolated by the cmdline's isolcpus parameter. Signed-off-by: Tal Shorer Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8fdb710bfdd7..6a5658cb46da 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -4957,6 +4958,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5555,7 +5560,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3 From ddf7005f32212f28669032651e09bd8d2245c35d Mon Sep 17 00:00:00 2001 From: Wang Long Date: Sun, 19 Nov 2017 16:08:37 -0500 Subject: debug cgroup: use task_css_set instead of rcu_dereference This macro `task_css_set` verifies that the caller is inside proper critical section if the kernel set CONFIG_PROVE_RCU=y. Signed-off-by: Wang Long Signed-off-by: Tejun Heo --- kernel/cgroup/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f780d8f6a9d..9caeda610249 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); if (refcnt > cset->nr_tasks) @@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; -- cgit v1.2.3 From 52cf373c37a684f8fc279d541307fad39d206376 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Tue, 28 Nov 2017 13:59:25 +0100 Subject: cgroup: properly init u64_stats Lockdep complains that the stats update is trying to register a non-static key. This is because u64_stats are using a seqlock on 32bit arches, which needs to be initialized before usage. Fixes: 041cd640b2f3 (cgroup: Implement cgroup2 basic CPU usage accounting) Signed-off-by: Lucas Stach Signed-off-by: Tejun Heo --- kernel/cgroup/stat.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c index 133b465691d6..1e111dd455c4 100644 --- a/kernel/cgroup/stat.c +++ b/kernel/cgroup/stat.c @@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp) } /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) - cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + cstat->updated_children = cgrp; + u64_stats_init(&cstat->sync); + } prev_cputime_init(&cgrp->stat.prev_cputime); -- cgit v1.2.3 From bd2b879a1ca55486fdb9dcac691bfd3dd79c83d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Oct 2017 12:29:01 -0700 Subject: rcu: Add tracing to irq/NMI dyntick-idle transitions Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 444aa2b3f24d..d069ba2d8412 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -880,12 +880,15 @@ void rcu_nmi_exit(void) * leave it in non-RCU-idle state. */ if (rdtp->dynticks_nmi_nesting != 1) { + trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, + rdtp->dynticks_nmi_nesting - 2); WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ rdtp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0); WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ rcu_dynticks_eqs_enter(); } @@ -1057,6 +1060,9 @@ void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); incby = 1; } + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), + rdtp->dynticks_nmi_nesting, + rdtp->dynticks_nmi_nesting + incby); WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdtp->dynticks_nmi_nesting + incby); barrier(); -- cgit v1.2.3 From 84585aa8b6ad24e5bdfba9db4a320a6aeed192ab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Oct 2017 15:55:16 -0700 Subject: rcu: Shrink ->dynticks_{nmi_,}nesting from long long to long Because the ->dynticks_nesting field now only contains the process-based nesting level instead of a value encoding both the process nesting level and the irq "nesting" level, we no longer need a long long, even on 32-bit systems. This commit therefore changes both the ->dynticks_nesting and ->dynticks_nmi_nesting fields to long. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 +- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree.h | 4 ++-- kernel/rcu/tree_plugin.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index afe0559d1867..6334f2c1abd0 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -31,7 +31,7 @@ #endif /* #else #ifdef CONFIG_RCU_TRACE */ /* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ -#define DYNTICK_IRQ_NONIDLE ((INT_MAX / 2) + 1) +#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d069ba2d8412..92de3bacda07 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -946,7 +946,7 @@ void rcu_irq_exit_irqson(void) * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_eqs_exit_common(long long newval, int user) +static void rcu_eqs_exit_common(long newval, int user) { RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) @@ -979,7 +979,7 @@ static void rcu_eqs_exit_common(long long newval, int user) static void rcu_eqs_exit(bool user) { struct rcu_dynticks *rdtp; - long long oldval; + long oldval; lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); @@ -1043,7 +1043,7 @@ void rcu_user_exit(void) void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int incby = 2; + long incby = 2; /* Complain about underflow. */ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index dbd7e3753bed..6488a3b0e729 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -38,8 +38,8 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - long long dynticks_nesting; /* Track irq/process nesting level. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ + long dynticks_nesting; /* Track process nesting level. */ + long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index db85ca3975f1..e94e754464cd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1687,7 +1687,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], -- cgit v1.2.3 From dec98900eae1e22467182e58688abe5fae98bd5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Oct 2017 16:24:29 -0700 Subject: rcu: Add ->dynticks field to rcu_dyntick trace event Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 92de3bacda07..5febb76809f6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -761,13 +761,13 @@ static void rcu_eqs_enter_common(bool user) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); + trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0); + trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0, rdtp->dynticks); rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -880,15 +880,14 @@ void rcu_nmi_exit(void) * leave it in non-RCU-idle state. */ if (rdtp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, - rdtp->dynticks_nmi_nesting - 2); + trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks); WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ rdtp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0); + trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks); WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ rcu_dynticks_eqs_enter(); } @@ -953,14 +952,13 @@ static void rcu_eqs_exit_common(long newval, int user) rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, newval); + trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, newval, rdtp->dynticks); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - trace_rcu_dyntick(TPS("Error on exit: not idle task"), - rdtp->dynticks_nesting, newval); + trace_rcu_dyntick(TPS("Error on exit: not idle task"), rdtp->dynticks_nesting, newval, rdtp->dynticks); rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -1062,7 +1060,7 @@ void rcu_nmi_enter(void) } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdtp->dynticks_nmi_nesting, - rdtp->dynticks_nmi_nesting + incby); + rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks); WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdtp->dynticks_nmi_nesting + incby); barrier(); -- cgit v1.2.3 From 914955e18ca09fc404d7fc3614bb04c96a03692c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Oct 2017 13:50:57 -0700 Subject: rcu: Stop duplicating lockdep checks in RCU's idle-entry code The three RCU_LOCKDEP_WARN() calls in rcu_eqs_enter_common() are redundant with other lockdep checks, so this commit removes them. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5febb76809f6..80cada11f544 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -781,17 +781,6 @@ static void rcu_eqs_enter_common(bool user) rdtp->dynticks_nesting = 0; rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); - - /* - * It is illegal to enter an extended quiescent state while - * in an RCU read-side critical section. - */ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), - "Illegal idle entry in RCU read-side critical section."); - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), - "Illegal idle entry in RCU-bh read-side critical section."); - RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), - "Illegal idle entry in RCU-sched read-side critical section."); } /* -- cgit v1.2.3 From 2342172fd6c148506456862d795c7f155baf6797 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Oct 2017 15:03:10 -0700 Subject: rcu: Avoid ->dynticks_nesting store tearing Although ->dynticks_nesting is updated only by process level, it is accessed from hardirq to check for interrupt-from-idle quiescent states. Store tearing is thus possible, so this commit applies WRITE_ONCE() to ->dynticks_nesting stores. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 80cada11f544..b2ded4d436c6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -778,7 +778,7 @@ static void rcu_eqs_enter_common(bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - rdtp->dynticks_nesting = 0; + WRITE_ONCE(rdtp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); } @@ -976,7 +976,7 @@ static void rcu_eqs_exit(bool user) rdtp->dynticks_nesting++; } else { rcu_eqs_exit_common(1, user); - rdtp->dynticks_nesting = 1; + WRITE_ONCE(rdtp->dynticks_nesting, 1); WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } } @@ -3713,7 +3713,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rdp->dynticks->dynticks_nesting = 1; + rdp->dynticks->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ -- cgit v1.2.3 From 215bba9f59e35e64b9936da62632b2fa3ede647c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Oct 2017 16:37:03 -0700 Subject: rcu: Fold rcu_eqs_enter_common() into rcu_eqs_enter() There is now only one call to rcu_eqs_enter_common() and there is no other reason to keep it separate. This commit therefore inlines it into its sole call site, saving a few lines of code in the process. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b2ded4d436c6..5c8a5796c71f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -749,16 +749,27 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * rcu_eqs_enter_common - current CPU is entering an extended quiescent state + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. * - * Enter idle, doing appropriate accounting. The caller must have - * disabled interrupts. + * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. */ -static void rcu_eqs_enter_common(bool user) +static void rcu_eqs_enter(bool user) { struct rcu_state *rsp; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_dynticks *rdtp; + + rdtp = this_cpu_ptr(&rcu_dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + rdtp->dynticks_nesting == 0); + if (rdtp->dynticks_nesting != 1) { + rdtp->dynticks_nesting--; + return; + } lockdep_assert_irqs_disabled(); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); @@ -783,28 +794,6 @@ static void rcu_eqs_enter_common(bool user) rcu_dynticks_task_enter(); } -/* - * Enter an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - * - * We crowbar the ->dynticks_nmi_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. - */ -static void rcu_eqs_enter(bool user) -{ - struct rcu_dynticks *rdtp; - - rdtp = this_cpu_ptr(&rcu_dynticks); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting == 0); - if (rdtp->dynticks_nesting == 1) - rcu_eqs_enter_common(user); - else - rdtp->dynticks_nesting--; -} - /** * rcu_idle_enter - inform RCU that current CPU is entering idle * -- cgit v1.2.3 From 9dd238e28640d5514bbd0ff2d425f32409981d85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Oct 2017 16:56:26 -0700 Subject: rcu: Fold rcu_eqs_exit_common() into rcu_eqs_exit() There is now only one call to rcu_eqs_exit_common() and there is no other reason to keep it separate. This commit therefore inlines it into its sole call site, saving a few lines of code in the process. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 50 ++++++++++++++++++-------------------------------- 1 file changed, 18 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5c8a5796c71f..46a8e06bf03e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -916,34 +916,6 @@ void rcu_irq_exit_irqson(void) local_irq_restore(flags); } -/* - * rcu_eqs_exit_common - current CPU moving away from extended quiescent state - * - * If the new value of the ->dynticks_nesting counter was previously zero, - * we really have exited idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. - */ -static void rcu_eqs_exit_common(long newval, int user) -{ - RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) - - rcu_dynticks_task_exit(); - rcu_dynticks_eqs_exit(); - rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, newval, rdtp->dynticks); - if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on exit: not idle task"), rdtp->dynticks_nesting, newval, rdtp->dynticks); - rcu_ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - /* * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. @@ -963,11 +935,25 @@ static void rcu_eqs_exit(bool user) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { rdtp->dynticks_nesting++; - } else { - rcu_eqs_exit_common(1, user); - WRITE_ONCE(rdtp->dynticks_nesting, 1); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + return; + } + rcu_dynticks_task_exit(); + rcu_dynticks_eqs_exit(); + rcu_cleanup_after_idle(); + trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks); + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !user && !is_idle_task(current)) { + struct task_struct *idle __maybe_unused = + idle_task(smp_processor_id()); + + trace_rcu_dyntick(TPS("Error on exit: not idle task"), rdtp->dynticks_nesting, 1, rdtp->dynticks); + rcu_ftrace_dump(DUMP_ORIG); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ } + WRITE_ONCE(rdtp->dynticks_nesting, 1); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } /** -- cgit v1.2.3 From e68bbb266dcfed201d8d54a2828ef820d747f083 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Oct 2017 19:55:31 -0700 Subject: rcu: Simplify rcu_eqs_{enter,exit}() non-idle task debug code The code that checks for non-idle non-nohz_idle-usermode tasks invoking rcu_eqs_enter() and rcu_eqs_exit() prints a considerable quantity of helpful information. However, these checks fire rarely, so the extra complexity is no longer worth it. This commit therefore replaces this debug code with simple WARN_ON_ONCE() statements. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 46a8e06bf03e..4d374d2bc925 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -773,17 +773,7 @@ static void rcu_eqs_enter(bool user) lockdep_assert_irqs_disabled(); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); - if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0, rdtp->dynticks); - rcu_ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); do_nocb_deferred_wakeup(rdp); @@ -941,17 +931,7 @@ static void rcu_eqs_exit(bool user) rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks); - if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on exit: not idle task"), rdtp->dynticks_nesting, 1, rdtp->dynticks); - rcu_ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdtp->dynticks_nesting, 1); WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } -- cgit v1.2.3 From d633198088bd9e358566c470ed182994403acc7a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Oct 2017 13:52:30 -0700 Subject: srcu: Prohibit call_srcu() use under raw spinlocks Invoking queue_delayed_work() while holding a raw spinlock is forbidden in -rt kernels, which is exactly what __call_srcu() does, indirectly via srcu_funnel_gp_start(). This commit therefore downgrades Tree SRCU's locking from raw to non-raw spinlocks, which works because call_srcu() is not ever called while holding a raw spinlock. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 109 +++++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d5880089ff6..d5cea81378cc 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -53,6 +53,33 @@ static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); static void process_srcu(struct work_struct *work); +/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) + +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_irq_rcu_node(p) \ + spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) + +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + /* * Initialize SRCU combining tree. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and @@ -77,7 +104,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) /* Each pass through this loop initializes one srcu_node structure. */ rcu_for_each_node_breadth_first(sp, snp) { - raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); + spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -111,7 +138,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp_first = sp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; @@ -170,7 +197,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -187,7 +214,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *sp) { - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -210,13 +237,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); return; } init_srcu_struct_fields(sp, true); - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -513,7 +540,7 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_lock(&sp->srcu_cb_mutex); /* End the current grace period. */ - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(sp); @@ -522,7 +549,7 @@ static void srcu_gp_end(struct srcu_struct *sp) gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) sp->srcu_gp_seq_needed_exp = gpseq; - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -530,7 +557,7 @@ static void srcu_gp_end(struct srcu_struct *sp) idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { - raw_spin_lock_irq_rcu_node(snp); + spin_lock_irq_rcu_node(snp); cbs = false; if (snp >= sp->level[rcu_num_lvls - 1]) cbs = snp->srcu_have_cbs[idx] == gpseq; @@ -540,7 +567,7 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_gp_seq_needed_exp = gpseq; mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - raw_spin_unlock_irq_rcu_node(snp); + spin_unlock_irq_rcu_node(snp); if (cbs) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); @@ -548,11 +575,11 @@ static void srcu_gp_end(struct srcu_struct *sp) if (!(gpseq & counter_wrap_check)) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; - raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); } } @@ -560,17 +587,17 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_unlock(&sp->srcu_cb_mutex); /* Start a new grace period if needed. */ - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); /* Throttle expedited grace periods: Should be rare! */ srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff ? 0 : SRCU_INTERVAL); } else { - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); } } @@ -590,18 +617,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, if (rcu_seq_done(&sp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; - raw_spin_lock_irqsave_rcu_node(snp, flags); + spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); } - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -623,12 +650,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, for (; snp != NULL; snp = snp->srcu_parent) { if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ - raw_spin_lock_irqsave_rcu_node(snp, flags); + spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; if (snp == sdp->mynode && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == sdp->mynode && snp_seq != s) { srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL @@ -644,11 +671,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) snp->srcu_gp_seq_needed_exp = s; - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -667,7 +694,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, queue_delayed_work(system_power_efficient_wq, &sp->work, srcu_get_delay(sp)); } - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -830,7 +857,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rhp->func = func; local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); - raw_spin_lock_rcu_node(sdp); + spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -844,7 +871,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) srcu_funnel_gp_start(sp, sdp, s, do_norm); else if (needexp) @@ -900,7 +927,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) /* * Make sure that later code is ordered after the SRCU grace - * period. This pairs with the raw_spin_lock_irq_rcu_node() + * period. This pairs with the spin_lock_irq_rcu_node() * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed * because the current CPU might have been totally uninvolved with * (and thus unordered against) that grace period. @@ -1024,7 +1051,7 @@ void srcu_barrier(struct srcu_struct *sp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); atomic_inc(&sp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); @@ -1033,7 +1060,7 @@ void srcu_barrier(struct srcu_struct *sp) debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&sp->srcu_barrier_cpu_cnt); } - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ @@ -1082,17 +1109,17 @@ static void srcu_advance_state(struct srcu_struct *sp) */ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(sp); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&sp->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1141,19 +1168,19 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp = container_of(work, struct srcu_data, work.work); sp = sdp->sp; rcu_cblist_init(&ready_cbs); - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); @@ -1166,13 +1193,13 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); if (more) srcu_schedule_cbs_sdp(sdp, 0); } @@ -1185,7 +1212,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pushgp = true; - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1195,7 +1222,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(sp); } - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); if (pushgp) queue_delayed_work(system_power_efficient_wq, &sp->work, delay); -- cgit v1.2.3 From dac95906003fec1b4801115830cc14ec61c74960 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Oct 2017 11:23:10 -0700 Subject: torture: Suppress CPU stall warnings during shutdown ftrace dump The torture_shutdown() function directly invokes ftrace_dump(), which can result in RCU CPU stall warnings when the ftrace buffer is large, which it usually is. This commit therefore invoks rcu_ftrace_dump() in place of ftrace_dump(), suppressing RCU CPU stall warnings during this time. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 637e172835d8..52781e838541 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -47,6 +47,7 @@ #include #include #include +#include "rcu/rcu.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); @@ -500,7 +501,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); - ftrace_dump(DUMP_ALL); + rcu_ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } -- cgit v1.2.3 From a0982dfa03efca6c239c52cabebcea4afb93ea6b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Oct 2017 16:24:28 -0700 Subject: sched: Stop resched_cpu() from sending IPIs to offline CPUs The rcutorture test suite occasionally provokes a splat due to invoking resched_cpu() on an offline CPU: WARNING: CPU: 2 PID: 8 at /home/paulmck/public_git/linux-rcu/arch/x86/kernel/smp.c:128 native_smp_send_reschedule+0x37/0x40 Modules linked in: CPU: 2 PID: 8 Comm: rcu_preempt Not tainted 4.14.0-rc4+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff902ede9daf00 task.stack: ffff96c50010c000 RIP: 0010:native_smp_send_reschedule+0x37/0x40 RSP: 0018:ffff96c50010fdb8 EFLAGS: 00010096 RAX: 000000000000002e RBX: ffff902edaab4680 RCX: 0000000000000003 RDX: 0000000080000003 RSI: 0000000000000000 RDI: 00000000ffffffff RBP: ffff96c50010fdb8 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: 00000000299f36ae R12: 0000000000000001 R13: ffffffff9de64240 R14: 0000000000000001 R15: ffffffff9de64240 FS: 0000000000000000(0000) GS:ffff902edfc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000f7d4c642 CR3: 000000001e0e2000 CR4: 00000000000006e0 Call Trace: resched_curr+0x8f/0x1c0 resched_cpu+0x2c/0x40 rcu_implicit_dynticks_qs+0x152/0x220 force_qs_rnp+0x147/0x1d0 ? sync_rcu_exp_select_cpus+0x450/0x450 rcu_gp_kthread+0x5a9/0x950 kthread+0x142/0x180 ? force_qs_rnp+0x1d0/0x1d0 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x27/0x40 Code: 14 01 0f 92 c0 84 c0 74 14 48 8b 05 14 4f f4 00 be fd 00 00 00 ff 90 a0 00 00 00 5d c3 89 fe 48 c7 c7 38 89 ca 9d e8 e5 56 08 00 <0f> ff 5d c3 0f 1f 44 00 00 8b 05 52 9e 37 02 85 c0 75 38 55 48 ---[ end trace 26df9e5df4bba4ac ]--- This splat cannot be generated by expedited grace periods because they always invoke resched_cpu() on the current CPU, which is good because expedited grace periods require that resched_cpu() unconditionally succeed. However, other parts of RCU can tolerate resched_cpu() acting as a no-op, at least as long as it doesn't happen too often. This commit therefore makes resched_cpu() invoke resched_curr() only if the CPU is either online or is the current CPU. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..c85dfb746f8c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -508,7 +508,8 @@ void resched_cpu(int cpu) unsigned long flags; raw_spin_lock_irqsave(&rq->lock, flags); - resched_curr(rq); + if (cpu_online(cpu) || cpu == smp_processor_id()) + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } -- cgit v1.2.3 From 2fe2582649aa2355f79acddb86bd4d6c5363eb63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Oct 2017 17:00:18 -0700 Subject: sched: Stop switched_to_rt() from sending IPIs to offline CPUs The rcutorture test suite occasionally provokes a splat due to invoking rt_mutex_lock() which needs to boost the priority of a task currently sitting on a runqueue that belongs to an offline CPU: WARNING: CPU: 0 PID: 12 at /home/paulmck/public_git/linux-rcu/arch/x86/kernel/smp.c:128 native_smp_send_reschedule+0x37/0x40 Modules linked in: CPU: 0 PID: 12 Comm: rcub/7 Not tainted 4.14.0-rc4+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff9ed3de5f8cc0 task.stack: ffffbbf80012c000 RIP: 0010:native_smp_send_reschedule+0x37/0x40 RSP: 0018:ffffbbf80012fd10 EFLAGS: 00010082 RAX: 000000000000002f RBX: ffff9ed3dd9cb300 RCX: 0000000000000004 RDX: 0000000080000004 RSI: 0000000000000086 RDI: 00000000ffffffff RBP: ffffbbf80012fd10 R08: 000000000009da7a R09: 0000000000007b9d R10: 0000000000000001 R11: ffffffffbb57c2cd R12: 000000000000000d R13: ffff9ed3de5f8cc0 R14: 0000000000000061 R15: ffff9ed3ded59200 FS: 0000000000000000(0000) GS:ffff9ed3dea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000080686f0 CR3: 000000001b9e0000 CR4: 00000000000006f0 Call Trace: resched_curr+0x61/0xd0 switched_to_rt+0x8f/0xa0 rt_mutex_setprio+0x25c/0x410 task_blocks_on_rt_mutex+0x1b3/0x1f0 rt_mutex_slowlock+0xa9/0x1e0 rt_mutex_lock+0x29/0x30 rcu_boost_kthread+0x127/0x3c0 kthread+0x104/0x140 ? rcu_report_unblock_qs_rnp+0x90/0x90 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x22/0x30 Code: f0 00 0f 92 c0 84 c0 74 14 48 8b 05 34 74 c5 00 be fd 00 00 00 ff 90 a0 00 00 00 5d c3 89 fe 48 c7 c7 a0 c6 fc b9 e8 d5 b5 06 00 <0f> ff 5d c3 0f 1f 44 00 00 8b 05 a2 d1 13 02 85 c0 75 38 55 48 But the target task's priority has already been adjusted, so the only purpose of switched_to_rt() invoking resched_curr() is to wake up the CPU running some task that needs to be preempted by the boosted task. But the CPU is offline, which presumably means that the task must be migrated to some other CPU, and that this other CPU will undertake any needed preemption at the time of migration. Because the runqueue lock is held when resched_curr() is invoked, we know that the boosted task cannot go anywhere, so it is not necessary to invoke resched_curr() in this particular case. This commit therefore makes switched_to_rt() refrain from invoking resched_curr() when the target CPU is offline. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4056c19ca3f0..f242f642ef53 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2206,7 +2206,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } -- cgit v1.2.3 From 5a93bae2c382c588f437ce0395e8032ae287dc36 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Thu, 19 Oct 2017 14:32:33 +0800 Subject: tracing: Fix code comments in trace.c Naming in code comments for tracing_snapshot, tracing_snapshot_alloc and trace_pid_filter_add_remove_task don't match the real function names. And latency_trace has been removed from tracing directory. Fix them. Link: http://lkml.kernel.org/r/1508394753-20887-1-git-send-email-chuhu@redhat.com Fixes: cab5037 ("tracing/ftrace: Enable snapshot function trigger") Fixes: 886b5b7 ("tracing: remove /debug/tracing/latency_trace") Signed-off-by: Chunyu Hu [ Replaced /sys/kernel/debug/tracing with /sys/kerne/tracing ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..5815ec16edd4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct } /** - * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove @@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr) } /** - * trace_snapshot - take a snapshot of the current buffer. + * tracing_snapshot - take a snapshot of the current buffer. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live @@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void) EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); /** - * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. * - * This is similar to trace_snapshot(), but it will allocate the + * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * @@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -- cgit v1.2.3 From 90e406f96f630c07d631a021fd4af10aac913e77 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 30 Nov 2017 11:39:43 +0800 Subject: tracing: Allocate mask_str buffer dynamically The default NR_CPUS can be very large, but actual possible nr_cpu_ids usually is very small. For my x86 distribution, the NR_CPUS is 8192 and nr_cpu_ids is 4. About 2 pages are wasted. Most machines don't have so many CPUs, so define a array with NR_CPUS just wastes memory. So let's allocate the buffer dynamically when need. With this change, the mutext tracing_cpumask_update_lock also can be removed now, which was used to protect mask_str. Link: http://lkml.kernel.org/r/1512013183-19107-1-git-send-email-changbin.du@intel.com Fixes: 36dfe9252bd4c ("ftrace: make use of tracing_cpumask") Cc: stable@vger.kernel.org Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5815ec16edd4..9f3f043ba3b7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4178,37 +4178,30 @@ static const struct file_operations show_traces_fops = { .llseek = seq_lseek, }; -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; - static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; + char *mask_str; int len; - mutex_lock(&tracing_cpumask_update_lock); + len = snprintf(NULL, 0, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) + return -ENOMEM; - len = snprintf(mask_str, count, "%*pb\n", + len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); if (len >= count) { count = -EINVAL; goto out_err; } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); out_err: - mutex_unlock(&tracing_cpumask_update_lock); + kfree(mask_str); return count; } @@ -4228,8 +4221,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - mutex_lock(&tracing_cpumask_update_lock); - local_irq_disable(); arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { @@ -4252,8 +4243,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); - - mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); return count; -- cgit v1.2.3 From 2dde6b0034dbc050957cdb6539ce28eca57e8cdf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 3 Nov 2017 11:39:57 +0100 Subject: tracing: make PREEMPTIRQ_EVENTS depend on TRACING When CONFIG_TRACING is disabled, the new preemptirq events tracer produces a build failure: In file included from kernel/trace/trace_irqsoff.c:17:0: kernel/trace/trace.h: In function 'trace_test_and_set_recursion': kernel/trace/trace.h:542:28: error: 'struct task_struct' has no member named 'trace_recursion' Adding an explicit dependency avoids the broken configuration. Link: http://lkml.kernel.org/r/20171103104031.270375-1-arnd@arndb.de Fixes: d59158162e03 ("tracing: Add support for preempt and irq enable/disable events") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad126c13..904c952ac383 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS depends on DEBUG_PREEMPT || !PROVE_LOCKING + depends on TRACING default n help Enable tracing of disable and enable events for preemption and irqs. -- cgit v1.2.3 From c4bfd39d7fa5203d4b387c283d360e9a108e85b3 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 17 May 2017 17:14:15 -0700 Subject: ring-buffer: Remove unused function __rb_data_page_index() This fixes the following warning when building with clang: kernel/trace/ring_buffer.c:1842:1: error: unused function '__rb_data_page_index' [-Werror,-Wunused-function] Link: http://lkml.kernel.org/r/20170518001415.5223-1-mka@chromium.org Reviewed-by: Douglas Anderson Signed-off-by: Matthias Kaehlcke Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 91874a95060d..c87766c1c204 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1799,12 +1799,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); -static __always_inline void * -__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) -{ - return bpage->data + index; -} - static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; -- cgit v1.2.3 From a773d419275bf54854ca6cfda8f2594ed2790faa Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Fri, 2 Jun 2017 13:20:25 +0300 Subject: tracing: Pass export pointer as argument to ->write() By passing an export descriptor to the write function, users don't need to keep a global static pointer and can rely on container_of() to fetch their own structure. Link: http://lkml.kernel.org/r/20170602102025.5140-1-felipe.balbi@linux.intel.com Acked-by: Steven Rostedt (VMware) Reviewed-by: Chunyan Zhang Signed-off-by: Felipe Balbi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9f3f043ba3b7..59518b8126d0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2415,7 +2415,7 @@ trace_process_export(struct trace_export *export, entry = ring_buffer_event_data(event); size = ring_buffer_event_length(event); - export->write(entry, size); + export->write(export, entry, size); } static DEFINE_MUTEX(ftrace_export_lock); -- cgit v1.2.3 From 4ab53fe612e21b0f509a3b468c56706364de98df Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 29 Nov 2017 11:12:27 +0000 Subject: PM: Provide a config snippet for disabling PM A frequent source of build problems is poor handling of optional PM support, almost all development is done with the PM options enabled but they can be turned off. Currently few if any of the build test services do this as standard as there is no standard config for it and the use of selects and def_bool means that simply setting CONFIG_PM=n doesn't do what is expected. To make this easier provide a fragement that can be used with KCONFIG_ALLCONFIG to force PM off. CONFIG_XEN is disabled as Xen uses hibernation callbacks which end up turning on power management on architectures with Xen. Some cpuidle implementations on ARM select PM so CONFIG_CPU_IDLE is disabled, and some ARM architectures unconditionally enable PM so they are also disabled. Signed-off-by: Mark Brown Signed-off-by: Rafael J. Wysocki --- kernel/configs/nopm.config | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 kernel/configs/nopm.config (limited to 'kernel') diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config new file mode 100644 index 000000000000..81ff07863576 --- /dev/null +++ b/kernel/configs/nopm.config @@ -0,0 +1,15 @@ +CONFIG_PM=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n + +# Triggers PM on OMAP +CONFIG_CPU_IDLE=n + +# Triggers enablement via hibernate callbacks +CONFIG_XEN=n + +# ARM/ARM64 architectures that select PM unconditionally +CONFIG_ARCH_OMAP2PLUS_TYPICAL=n +CONFIG_ARCH_RENESAS=n +CONFIG_ARCH_TEGRA=n +CONFIG_ARCH_VEXPRESS=n -- cgit v1.2.3 From a7e6425ea5816947d3cb51fbf57351207c074383 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Oct 2017 08:25:02 -0700 Subject: workqueue: Eliminate cond_resched_rcu_qs() in favor of cond_resched() Now that cond_resched() also provides RCU quiescent states when needed, it can be used in place of cond_resched_rcu_qs(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Tejun Heo Reviewed-by: Lai Jiangshan --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8fdb710bfdd7..aee7eaab05cb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2135,7 +2135,7 @@ __acquires(&pool->lock) * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ - cond_resched_rcu_qs(); + cond_resched(); spin_lock_irq(&pool->lock); -- cgit v1.2.3 From e31d28b6ab8f0431e2288edb02723269f54d1471 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Oct 2017 08:26:32 -0700 Subject: trace: Eliminate cond_resched_rcu_qs() in favor of cond_resched() Now that cond_resched() also provides RCU quiescent states when needed, it can be used in place of cond_resched_rcu_qs(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt Cc: Ingo Molnar --- kernel/trace/trace_benchmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 79f838a75077..22fee766081b 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -165,7 +165,7 @@ static int benchmark_event_kthread(void *arg) * this thread will never voluntarily schedule which would * block synchronize_rcu_tasks() indefinitely. */ - cond_resched_rcu_qs(); + cond_resched(); } return 0; -- cgit v1.2.3 From edf22f4ca26babcd8cba4f049c6be53f0e73bcc1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Oct 2017 08:31:12 -0700 Subject: softirq: Eliminate cond_resched_rcu_qs() in favor of cond_resched() Now that cond_resched() also provides RCU quiescent states when needed, it can be used in place of cond_resched_rcu_qs(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney Cc: NeilBrown Cc: Ingo Molnar --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 2f5e87f1bae2..24d243ef8e71 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -665,7 +665,7 @@ static void run_ksoftirqd(unsigned int cpu) */ __do_softirq(); local_irq_enable(); - cond_resched_rcu_qs(); + cond_resched(); return; } local_irq_enable(); -- cgit v1.2.3 From 5c6338b4877038d28148fcfe1e7f862970ebaad1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 11:08:53 -0700 Subject: uprobes: Remove now-redundant smp_read_barrier_depends() Now that READ_ONCE() implies smp_read_barrier_depends(), the get_xol_area() and get_trampoline_vaddr() no longer need their smp_read_barrier_depends() calls, which this commit removes. While we are here, convert the corresponding smp_wmb() to an smp_store_release(). Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin --- kernel/events/uprobes.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 267f6ef91d97..ce6848e46e94 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1167,8 +1167,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } ret = 0; - smp_wmb(); /* pairs with get_xol_area() */ - mm->uprobes_state.xol_area = area; + /* pairs with get_xol_area() */ + smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: up_write(&mm->mmap_sem); @@ -1230,8 +1230,8 @@ static struct xol_area *get_xol_area(void) if (!mm->uprobes_state.xol_area) __create_xol_area(0); - area = mm->uprobes_state.xol_area; - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + /* Pairs with xol_add_vma() smp_store_release() */ + area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } @@ -1528,8 +1528,8 @@ static unsigned long get_trampoline_vaddr(void) struct xol_area *area; unsigned long trampoline_vaddr = -1; - area = current->mm->uprobes_state.xol_area; - smp_read_barrier_depends(); + /* Pairs with xol_add_vma() smp_store_release() */ + area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; -- cgit v1.2.3 From 548095dea63ffc016d39c35b32c628d033638aca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 11:22:50 -0700 Subject: locking: Remove smp_read_barrier_depends() from queued_spin_lock_slowpath() Queued spinlocks are not used by DEC Alpha, and furthermore operations such as READ_ONCE() and release/relaxed RMW atomics are being changed to imply smp_read_barrier_depends(). This commit therefore removes the now-redundant smp_read_barrier_depends() from queued_spin_lock_slowpath(), and adjusts the comments accordingly. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar --- kernel/locking/qspinlock.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 294294c71ba4..38ece035039e 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -170,7 +170,7 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) * @tail : The new queue tail code word * Return: The previous queue tail code word * - * xchg(lock, tail) + * xchg(lock, tail), which heads an address dependency * * p,*,* -> n,*,* ; prev = xchg(lock, node) */ @@ -409,13 +409,11 @@ queue: if (old & _Q_TAIL_MASK) { prev = decode_tail(old); /* - * The above xchg_tail() is also a load of @lock which generates, - * through decode_tail(), a pointer. - * - * The address dependency matches the RELEASE of xchg_tail() - * such that the access to @prev must happen after. + * The above xchg_tail() is also a load of @lock which + * generates, through decode_tail(), a pointer. The address + * dependency matches the RELEASE of xchg_tail() such that + * the subsequent access to @prev happens after. */ - smp_read_barrier_depends(); WRITE_ONCE(prev->next, node); -- cgit v1.2.3 From 243d1a7977ae0814aa1ccb8bb87f8a4e0822ca8f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 11:30:11 -0700 Subject: tracepoint: Remove smp_read_barrier_depends() from comment The comment in tracepoint_add_func() mentions smp_read_barrier_depends(), whose use should be quite restricted. This commit updates the comment to instead mention the smp_store_release() and rcu_dereference_sched() that the current code actually uses. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Acked-by: Steven Rostedt (VMware) Acked-by: Mathieu Desnoyers --- kernel/tracepoint.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 685c50ae6300..671b13457387 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -212,11 +212,10 @@ static int tracepoint_add_func(struct tracepoint *tp, } /* - * rcu_assign_pointer has a smp_wmb() which makes sure that the new - * probe callbacks array is consistent before setting a pointer to it. - * This array is referenced by __DO_TRACE from - * include/linux/tracepoints.h. A matching smp_read_barrier_depends() - * is used. + * rcu_assign_pointer has as smp_store_release() which makes sure + * that the new probe callbacks array is consistent before setting + * a pointer to it. This array is referenced by __DO_TRACE from + * include/linux/tracepoint.h using rcu_dereference_sched(). */ rcu_assign_pointer(tp->funcs, tp_funcs); if (!static_key_enabled(&tp->key)) -- cgit v1.2.3 From 11db855c3d06e82f432cb1bafd73296586d5ceec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Dec 2017 14:41:11 -0800 Subject: Revert "cpuset: Make cpuset hotplug synchronous" This reverts commit 1599a185f0e6113be185b9fb809c621c73865829. This and the previous commit led to another circular locking scenario and the scenario which is fixed by this commit no longer exists after e8b3f8db7aad ("workqueue/hotplug: simplify workqueue_offline_cpu()") which removes work item flushing from hotplug path. Revert it for now. Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 41 +++++++++++++++++++++-------------------- kernel/power/process.c | 2 ++ kernel/sched/core.c | 1 + 3 files changed, 24 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 227bc25d951d..cab5fd1ee767 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2277,8 +2277,15 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** - * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2293,7 +2300,7 @@ retry: * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug(bool use_cpu_hp_lock) +static void cpuset_hotplug_workfn(struct work_struct *work) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -2351,31 +2358,25 @@ static void cpuset_hotplug(bool use_cpu_hp_lock) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) { - if (use_cpu_hp_lock) - rebuild_sched_domains(); - else { - /* Acquiring cpu_hotplug_lock is not required. - * When cpuset_hotplug() is called in hotplug path, - * cpu_hotplug_lock is held by the hotplug context - * which is waiting for cpuhp_thread_fun to indicate - * completion of callback. - */ - mutex_lock(&cpuset_mutex); - rebuild_sched_domains_cpuslocked(); - mutex_unlock(&cpuset_mutex); - } + if (cpus_updated || force_rebuild) { + force_rebuild = false; + rebuild_sched_domains(); } } -static void cpuset_hotplug_workfn(struct work_struct *work) +void cpuset_update_active_cpus(void) { - cpuset_hotplug(true); + /* + * We're inside cpu hotplug critical region which usually nests + * inside cgroup synchronization. Bounce actual hotplug processing + * to a work item to avoid reverse locking order. + */ + schedule_work(&cpuset_hotplug_work); } -void cpuset_update_active_cpus(void) +void cpuset_wait_for_hotplug(void) { - cpuset_hotplug(false); + flush_work(&cpuset_hotplug_work); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index c326d7235c5f..7381d49a44db 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -204,6 +204,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 88b3450b29ab..75554f366fd3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5624,6 +5624,7 @@ static void cpuset_cpu_active(void) * restore the original sched domains by considering the * cpuset configurations. */ + cpuset_force_rebuild(); } cpuset_update_active_cpus(); } -- cgit v1.2.3 From e8b3f8db7aad99fcc5234fc5b89984ff6620de3d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Dec 2017 22:20:36 +0800 Subject: workqueue/hotplug: simplify workqueue_offline_cpu() Since the recent cpu/hotplug refactoring, workqueue_offline_cpu() is guaranteed to run on the local cpu which is going offline. This also fixes the following deadlock by removing work item scheduling and flushing from CPU hotplug path. http://lkml.kernel.org/r/1504764252-29091-1-git-send-email-prsood@codeaurora.org tj: Description update. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6a5658cb46da..48a4d00f55dc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1635,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because wq_unbind_fn() releases + * Sanity check nr_running. Because unbind_workers() releases * pool->lock between setting %WORKER_UNBOUND and zapping * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. @@ -4511,9 +4511,8 @@ void show_workqueue_state(void) * cpu comes back online. */ -static void wq_unbind_fn(struct work_struct *work) +static void unbind_workers(int cpu) { - int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; @@ -4710,12 +4709,13 @@ int workqueue_online_cpu(unsigned int cpu) int workqueue_offline_cpu(unsigned int cpu) { - struct work_struct unbind_work; struct workqueue_struct *wq; /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); + if (WARN_ON(cpu != smp_processor_id())) + return -1; + + unbind_workers(cpu); /* update NUMA affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); @@ -4723,9 +4723,6 @@ int workqueue_offline_cpu(unsigned int cpu) wq_update_unbound_numa(wq, cpu, false); mutex_unlock(&wq_pool_mutex); - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); return 0; } -- cgit v1.2.3 From 62408c1ef00784e8bcfc4848ade76480fb8aed21 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 1 Dec 2017 22:23:07 +0800 Subject: workqueue/hotplug: remove the workaround in rebind_workers() Since the cpu/hotplug refactoring, DOWN_FAILED is never called without preceding DOWN_PREPARE making the workaround unnecessary. Remove it. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 48a4d00f55dc..45ce93f3dd1f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4589,16 +4589,6 @@ static void rebind_workers(struct worker_pool *pool) spin_lock_irq(&pool->lock); - /* - * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED - * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is - * being reworked and this can go away in time. - */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - spin_unlock_irq(&pool->lock); - return; - } - pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { -- cgit v1.2.3 From bdfbbda90aeb75ce0951413fd7f495d4d377bd5e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Dec 2017 14:55:59 -0800 Subject: Revert "cgroup/cpuset: remove circular dependency deadlock" This reverts commit aa24163b2ee5c92120e32e99b5a93143a0f4258e. This and the following commit led to another circular locking scenario and the scenario which is fixed by this commit no longer exists after e8b3f8db7aad ("workqueue/hotplug: simplify workqueue_offline_cpu()") which removes work item flushing from hotplug path. Revert it for now. Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 53 ++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index cab5fd1ee767..f7efa7b4d825 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -812,18 +812,6 @@ done: return ndoms; } -static void cpuset_sched_change_begin(void) -{ - cpus_read_lock(); - mutex_lock(&cpuset_mutex); -} - -static void cpuset_sched_change_end(void) -{ - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); -} - /* * Rebuild scheduler domains. * @@ -833,14 +821,16 @@ static void cpuset_sched_change_end(void) * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * + * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void rebuild_sched_domains_cpuslocked(void) +static void rebuild_sched_domains_locked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; lockdep_assert_held(&cpuset_mutex); + get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -848,25 +838,27 @@ static void rebuild_sched_domains_cpuslocked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - return; + goto out; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); +out: + put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_cpuslocked(void) +static void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { - cpuset_sched_change_begin(); - rebuild_sched_domains_cpuslocked(); - cpuset_sched_change_end(); + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); + mutex_unlock(&cpuset_mutex); } /** @@ -952,7 +944,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_cpuslocked(); + rebuild_sched_domains_locked(); } /** @@ -1284,7 +1276,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_cpuslocked(); + rebuild_sched_domains_locked(); } return 0; @@ -1317,6 +1309,7 @@ static void update_tasks_flags(struct cpuset *cs) * * Call with cpuset_mutex held. */ + static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { @@ -1349,7 +1342,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_cpuslocked(); + rebuild_sched_domains_locked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1617,7 +1610,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -1653,7 +1646,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); return retval; } @@ -1664,7 +1657,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1677,7 +1670,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); return retval; } @@ -1716,7 +1709,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1740,7 +1733,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2041,14 +2034,14 @@ out_unlock: /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_cpuslocked(). + * will call rebuild_sched_domains_locked(). */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpuset_sched_change_begin(); + mutex_lock(&cpuset_mutex); if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -2056,7 +2049,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - cpuset_sched_change_end(); + mutex_unlock(&cpuset_mutex); } static void cpuset_css_free(struct cgroup_subsys_state *css) -- cgit v1.2.3 From a555e9d86ee384d9d3cb3310a57aed33f7e053d4 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Thu, 7 Dec 2017 21:30:43 +0800 Subject: sched/fair: Remove unused 'curr' parameter from wakeup_gran The first parameter of wakeup_gran(), 'curr', is unnecessary now. Signed-off-by: Cheng Jian Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: huawei.libin@huawei.com Cc: xiexiuqi@huawei.com Link: http://lkml.kernel.org/r/1512653443-179848-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2fe3aa853e4d..2915c0d95107 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6449,8 +6449,7 @@ static void task_dead_fair(struct task_struct *p) } #endif /* CONFIG_SMP */ -static unsigned long -wakeup_gran(struct sched_entity *curr, struct sched_entity *se) +static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -6492,7 +6491,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) if (vdiff <= 0) return -1; - gran = wakeup_gran(curr, se); + gran = wakeup_gran(se); if (vdiff > gran) return 1; -- cgit v1.2.3 From 2064a5ab04707c55003e099e5abbf19a0826bbac Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 3 Dec 2017 13:19:00 -0800 Subject: sched/core: Fix kernel-doc warnings after code movement Fix the following kernel-doc warnings after code restructuring: ../kernel/sched/core.c:5113: warning: No description found for parameter 't' ../kernel/sched/core.c:5113: warning: Excess function parameter 'interval' description in 'sched_rr_get_interval' get rid of set_fs()") Signed-off-by: Randy Dunlap Cc: Al Viro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: abca5fc535a3e ("sched_rr_get_interval(): move compat to native, Link: http://lkml.kernel.org/r/995c6ded-b32e-bbe4-d9f5-4d42d121aff1@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..644fa2e3d993 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5097,17 +5097,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) return ret; } -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; @@ -5144,6 +5133,17 @@ out_unlock: return retval; } +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. + */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) { -- cgit v1.2.3 From 01dfee9582d9b4403c4902df096ed8b43d55181c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 8 Dec 2017 11:56:14 +0900 Subject: workqueue: remove unneeded kallsyms include The filw was converted from print_symbol() to %pf some time ago (044c782ce3a901fb "workqueue: fix checkpatch issues"). kallsyms does not seem to be needed anymore. Signed-off-by: Sergey Senozhatsky Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 45ce93f3dd1f..43d18cb46308 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From efd88b02bb9e6b8b73a20ea611e5d07ed6d4af34 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Oct 2017 14:52:41 -0700 Subject: rcu: Add comment giving debug strategy for double call_rcu() The following statement has for some reason proven non-intuitive: WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); This commit therefore adds a comment that states that this warning usually triggers in response to a double call_rcu(), which is sort of like a double free. The comment also suggests building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y to track down the double call_rcu(). Reported-by: David Howells Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9c0ca2ccf0c..1bdc0481aaf1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2789,6 +2789,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->n_force_qs_snap = rsp->n_force_qs; } else if (count < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = count; + + /* + * The following usually indicates a double call_rcu(). To track + * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. + */ WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); local_irq_restore(flags); -- cgit v1.2.3 From 84b12b752f41cd3d25d75692c2145d816e42926c Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Fri, 17 Nov 2017 21:40:15 +0600 Subject: rcu: Remove have_rcu_nocb_mask from tree_plugin.h Currently have_rcu_nocb_mask is used to avoid double allocation of rcu_nocb_mask during boot up. Due to different representation of cpumask_var_t on different kernel config CPUMASK=y(or n) it was okay. But now we have a helper cpumask_available(), which can be utilized to check whether rcu_nocb_mask has been allocated or not without using a variable. Removing the variable also reduces vmlinux size. Unpatched version: text data bss dec hex filename 13050393 7852470 14543408 35446271 21cddff vmlinux Patched version: text data bss dec hex filename 13050390 7852438 14543408 35446236 21cdddc vmlinux Signed-off-by: Rakib Mullick Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index db85ca3975f1..13a8e08f1998 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -61,7 +61,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ -static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ @@ -1752,7 +1751,6 @@ static void increment_cpu_stall_ticks(void) static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - have_rcu_nocb_mask = true; cpulist_parse(str, rcu_nocb_mask); return 1; } @@ -1801,7 +1799,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) /* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { - if (have_rcu_nocb_mask) + if (cpumask_available(rcu_nocb_mask)) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } @@ -2295,14 +2293,13 @@ void __init rcu_init_nohz(void) need_rcu_nocb_mask = true; #endif /* #if defined(CONFIG_NO_HZ_FULL) */ - if (!have_rcu_nocb_mask && need_rcu_nocb_mask) { + if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); return; } - have_rcu_nocb_mask = true; } - if (!have_rcu_nocb_mask) + if (!cpumask_available(rcu_nocb_mask)) return; #if defined(CONFIG_NO_HZ_FULL) @@ -2428,7 +2425,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ struct rcu_data *rdp_prev = NULL; - if (!have_rcu_nocb_mask) + if (!cpumask_available(rcu_nocb_mask)) return; if (ls == -1) { ls = int_sqrt(nr_cpu_ids); -- cgit v1.2.3 From cc1321c96f855525fbd847fec130f000daa1bb1f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Oct 2017 11:05:03 -0700 Subject: torture: Reduce #ifdefs for preempt_schedule() This commit adds a torture_preempt_schedule() that is nothingness in !PREEMPT builds and is preempt_schedule() otherwise. Then torture_preempt_schedule() is used to eliminate several ugly #ifdefs, both in rcutorture and in locktorture. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 24 ++++++------------------ kernel/rcu/rcutorture.c | 4 +--- 2 files changed, 7 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f24582d4dad3..617cea2520b3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -130,10 +130,8 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_lock_busted_write_unlock(void) @@ -179,10 +177,8 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) @@ -352,10 +348,8 @@ static void torture_mutex_delay(struct torture_random_state *trsp) mdelay(longdelay_ms * 5); else mdelay(longdelay_ms / 5); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_mutex_unlock(void) __releases(torture_mutex) @@ -507,10 +501,8 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) @@ -547,10 +539,8 @@ static void torture_rwsem_write_delay(struct torture_random_state *trsp) mdelay(longdelay_ms * 10); else mdelay(longdelay_ms / 10); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rwsem_up_write(void) __releases(torture_rwsem) @@ -574,10 +564,8 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp) mdelay(longdelay_ms * 2); else mdelay(longdelay_ms / 2); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rwsem_up_read(void) __releases(torture_rwsem) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 74f6b0146b98..e7d3cce84214 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -315,11 +315,9 @@ static void rcu_read_delay(struct torture_random_state *rrsp) } if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!preempt_count() && !(torture_random(rrsp) % (nrealreaders * 20000))) - preempt_schedule(); /* No QS if preempt_disable() in effect */ -#endif + torture_preempt_schedule(); /* QS only if preemptible. */ } static void rcu_torture_read_unlock(int idx) __releases(RCU) -- cgit v1.2.3 From e8302739aa2204d52dacf9e9619cb6e755fa997a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Oct 2017 11:23:42 -0700 Subject: rcutorture: Preempt RCU-preempt readers more vigorously This commit attempts to make a very rare rcutorture failure happen more often by increasing the fraction of RCU-preempt read-side critical sections that are preempted. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e7d3cce84214..1074ecc3f72f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -316,7 +316,7 @@ static void rcu_read_delay(struct torture_random_state *rrsp) if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 20000))) + !(torture_random(rrsp) % (nrealreaders * 500))) torture_preempt_schedule(); /* QS only if preemptible. */ } -- cgit v1.2.3 From f2f762608f45353b0b8c37507824f95bb716c3d5 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 15 May 2017 02:07:22 -0700 Subject: locking/locktorture: Fix rwsem reader_delay We should account for nreader threads, not writers in this callback. Could even trigger a div by 0 if the user explicitly disables writers. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 617cea2520b3..a307a79e6b0b 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -560,7 +560,7 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp) /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + (cxt.nrealreaders_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 2); else mdelay(longdelay_ms / 2); -- cgit v1.2.3 From 2ce77d16db4240dd2e422fc0a5c26d3e2ec03446 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 15 May 2017 02:07:23 -0700 Subject: locking/locktorture: Fix num reader/writer corner cases Things can explode for locktorture if the user does combinations of nwriters_stress=0 nreaders_stress=0. Fix this by not assuming we always want to torture writer threads. Reported-by: Jeremy Linton Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney Reviewed-by: Jeremy Linton Tested-by: Jeremy Linton --- kernel/locking/locktorture.c | 76 +++++++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index a307a79e6b0b..2a1fc2a58910 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -703,8 +703,7 @@ static void __torture_print_stats(char *page, { bool fail = 0; int i, n_stress; - long max = 0; - long min = statp[0].n_lock_acquired; + long max = 0, min = statp ? statp[0].n_lock_acquired : 0; long long sum = 0; n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; @@ -811,7 +810,7 @@ static void lock_torture_cleanup(void) * such, only perform the underlying torture-specific cleanups, * and avoid anything related to locktorture. */ - if (!cxt.lwsa) + if (!cxt.lwsa && !cxt.lrsa) goto end; if (writer_tasks) { @@ -886,6 +885,13 @@ static int __init lock_torture_init(void) firsterr = -EINVAL; goto unwind; } + + if (nwriters_stress == 0 && nreaders_stress == 0) { + pr_alert("lock-torture: must run at least one locking thread\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cxt.cur_ops->init) cxt.cur_ops->init(); @@ -909,17 +915,19 @@ static int __init lock_torture_init(void) #endif /* Initialize the statistics so that each run gets its own numbers. */ + if (nwriters_stress) { + lock_is_write_held = 0; + cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); + if (cxt.lwsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } - lock_is_write_held = 0; - cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); - if (cxt.lwsa == NULL) { - VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < cxt.nrealwriters_stress; i++) { - cxt.lwsa[i].n_lock_fail = 0; - cxt.lwsa[i].n_lock_acquired = 0; + for (i = 0; i < cxt.nrealwriters_stress; i++) { + cxt.lwsa[i].n_lock_fail = 0; + cxt.lwsa[i].n_lock_acquired = 0; + } } if (cxt.cur_ops->readlock) { @@ -936,19 +944,21 @@ static int __init lock_torture_init(void) cxt.nrealreaders_stress = cxt.nrealwriters_stress; } - lock_is_read_held = 0; - cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); - if (cxt.lrsa == NULL) { - VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); - firsterr = -ENOMEM; - kfree(cxt.lwsa); - cxt.lwsa = NULL; - goto unwind; - } - - for (i = 0; i < cxt.nrealreaders_stress; i++) { - cxt.lrsa[i].n_lock_fail = 0; - cxt.lrsa[i].n_lock_acquired = 0; + if (nreaders_stress) { + lock_is_read_held = 0; + cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); + if (cxt.lrsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); + firsterr = -ENOMEM; + kfree(cxt.lwsa); + cxt.lwsa = NULL; + goto unwind; + } + + for (i = 0; i < cxt.nrealreaders_stress; i++) { + cxt.lrsa[i].n_lock_fail = 0; + cxt.lrsa[i].n_lock_acquired = 0; + } } } @@ -978,12 +988,14 @@ static int __init lock_torture_init(void) goto unwind; } - writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), - GFP_KERNEL); - if (writer_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); - firsterr = -ENOMEM; - goto unwind; + if (nwriters_stress) { + writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), + GFP_KERNEL); + if (writer_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } } if (cxt.cur_ops->readlock) { -- cgit v1.2.3 From 4ced3314fd3a73dabac4e8a41747883eff36c3e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Nov 2017 15:01:02 -0800 Subject: torture: Make stutter less vulnerable to compilers and races The stutter_wait() function repeatedly fetched stutter_pause_test, and should really just fetch it once on each pass. The races should be harmless, but why have the races? Also, the whole point of the value "2" for stutter_pause_test is to get everyone to start at very nearly the same time, but the value "2" was the first jiffy of the stutter rather than the last jiffy of the stutter. This commit rearranges the code to be more sensible. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 52781e838541..3bcbd4fbfe18 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -573,18 +573,21 @@ static int stutter; */ void stutter_wait(const char *title) { + int spt; + cond_resched_rcu_qs(); - while (READ_ONCE(stutter_pause_test) || - (torture_runnable && !READ_ONCE(*torture_runnable))) { - if (stutter_pause_test) - if (READ_ONCE(stutter_pause_test) == 1) - schedule_timeout_interruptible(1); - else - while (READ_ONCE(stutter_pause_test)) - cond_resched(); - else + spt = READ_ONCE(stutter_pause_test); + while (spt || (torture_runnable && !READ_ONCE(*torture_runnable))) { + if (spt == 1) { + schedule_timeout_interruptible(1); + } else if (spt == 2) { + while (READ_ONCE(stutter_pause_test)) + cond_resched(); + } else { schedule_timeout_interruptible(round_jiffies_relative(HZ)); + } torture_shutdown_absorb(title); + spt = READ_ONCE(stutter_pause_test); } } EXPORT_SYMBOL_GPL(stutter_wait); @@ -597,17 +600,15 @@ static int torture_stutter(void *arg) { VERBOSE_TOROUT_STRING("torture_stutter task started"); do { - if (!torture_must_stop()) { - if (stutter > 1) { - schedule_timeout_interruptible(stutter - 1); - WRITE_ONCE(stutter_pause_test, 2); - } - schedule_timeout_interruptible(1); + if (!torture_must_stop() && stutter > 1) { WRITE_ONCE(stutter_pause_test, 1); + schedule_timeout_interruptible(stutter - 1); + WRITE_ONCE(stutter_pause_test, 2); + schedule_timeout_interruptible(1); } + WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) schedule_timeout_interruptible(stutter); - WRITE_ONCE(stutter_pause_test, 0); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); -- cgit v1.2.3 From a2f2577d96ad060b65eb909dd39b57d676754119 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Nov 2017 20:19:17 -0800 Subject: torture: Eliminate torture_runnable and perf_runnable The purpose of torture_runnable is to allow rcutorture and locktorture to be started and stopped via sysfs when they are built into the kernel (as in not compiled as loadable modules). However, the 0444 permissions for both instances of torture_runnable prevent this use case from ever being put into practice. Given that there have been no complaints about this deficiency, it is reasonable to conclude that no one actually makes use of this sysfs capability. The perf_runnable module parameter for rcuperf is in the same situation. This commit therefore removes both torture_runnable instances as well as perf_runnable. Reported-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 6 +----- kernel/rcu/rcuperf.c | 6 +----- kernel/rcu/rcutorture.c | 6 +----- kernel/torture.c | 6 ++---- 4 files changed, 5 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 2a1fc2a58910..6850ffd69125 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -77,10 +77,6 @@ struct lock_stress_stats { long n_lock_acquired; }; -int torture_runnable = IS_ENABLED(MODULE); -module_param(torture_runnable, int, 0444); -MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); - /* Forward reference. */ static void lock_torture_cleanup(void); @@ -866,7 +862,7 @@ static int __init lock_torture_init(void) &percpu_rwsem_lock_ops, }; - if (!torture_init_begin(torture_type, verbose, &torture_runnable)) + if (!torture_init_begin(torture_type, verbose)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 1f87a02c3399..d1ebdf9868bb 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -106,10 +106,6 @@ static int rcu_perf_writer_state; #define MAX_MEAS 10000 #define MIN_MEAS 100 -static int perf_runnable = IS_ENABLED(MODULE); -module_param(perf_runnable, int, 0444); -MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); - /* * Operations vector for selecting different types of tests. */ @@ -646,7 +642,7 @@ rcu_perf_init(void) &tasks_ops, }; - if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + if (!torture_init_begin(perf_type, verbose)) return -EBUSY; /* Process args and tell the world that the perf'er is on the job. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1074ecc3f72f..308e6fdbced8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -187,10 +187,6 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -static int torture_runnable = IS_ENABLED(MODULE); -module_param(torture_runnable, int, 0444); -MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); - #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ @@ -1729,7 +1725,7 @@ rcu_torture_init(void) &sched_ops, &tasks_ops, }; - if (!torture_init_begin(torture_type, verbose, &torture_runnable)) + if (!torture_init_begin(torture_type, verbose)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ diff --git a/kernel/torture.c b/kernel/torture.c index 3bcbd4fbfe18..572576ad9f58 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -61,7 +61,6 @@ static bool verbose; #define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); -static int *torture_runnable; #ifdef CONFIG_HOTPLUG_CPU @@ -577,7 +576,7 @@ void stutter_wait(const char *title) cond_resched_rcu_qs(); spt = READ_ONCE(stutter_pause_test); - while (spt || (torture_runnable && !READ_ONCE(*torture_runnable))) { + while (spt) { if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { @@ -649,7 +648,7 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -bool torture_init_begin(char *ttype, bool v, int *runnable) +bool torture_init_begin(char *ttype, bool v) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { @@ -661,7 +660,6 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) } torture_type = ttype; verbose = v; - torture_runnable = runnable; fullstop = FULLSTOP_DONTSTOP; return true; } -- cgit v1.2.3 From 29d3939084583b26a5487be64b9523e61468f1be Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Nov 2017 22:07:59 -0800 Subject: torture: Save a line in stutter_wait(): while -> for Signed-off-by: Paul E. McKenney --- kernel/torture.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 572576ad9f58..37b94012a3f8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -576,7 +576,7 @@ void stutter_wait(const char *title) cond_resched_rcu_qs(); spt = READ_ONCE(stutter_pause_test); - while (spt) { + for (; spt; spt = READ_ONCE(stutter_pause_test)) { if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { @@ -586,7 +586,6 @@ void stutter_wait(const char *title) schedule_timeout_interruptible(round_jiffies_relative(HZ)); } torture_shutdown_absorb(title); - spt = READ_ONCE(stutter_pause_test); } } EXPORT_SYMBOL_GPL(stutter_wait); -- cgit v1.2.3 From f87f3a328dbbb3e79dd53e7e889ced9222512649 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 28 Nov 2017 18:42:18 +0000 Subject: locking/core: Fix deadlock during boot on systems with GENERIC_LOCKBREAK Commit: a8a217c22116 ("locking/core: Remove {read,spin,write}_can_lock()") removed the definition of raw_spin_can_lock(), causing the GENERIC_LOCKBREAK spin_lock() routines to poll the ->break_lock field when waiting on a lock. This has been reported to cause a deadlock during boot on s390, because the ->break_lock field is also set by the waiters, and can potentially remain set indefinitely if no other CPUs come in to take the lock after it has been released. This patch removes the explicit spinning on ->break_lock from the waiters, instead relying on the outer trylock() operation to determine when the lock is available. Reported-by: Sebastian Ott Tested-by: Sebastian Ott Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Thomas Gleixner Fixes: a8a217c22116 ("locking/core: Remove {read,spin,write}_can_lock()") Link: http://lkml.kernel.org/r/1511894539-7988-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/spinlock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 1fd1a7543cdd..0ebb253e2199 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -68,8 +68,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + \ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ @@ -88,8 +88,8 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + \ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ -- cgit v1.2.3 From d89c70356acf11b7cf47ca5cfcafae5062a85451 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 28 Nov 2017 18:42:19 +0000 Subject: locking/core: Remove break_lock field when CONFIG_GENERIC_LOCKBREAK=y When CONFIG_GENERIC_LOCKBEAK=y, locking structures grow an extra int ->break_lock field which is used to implement raw_spin_is_contended() by setting the field to 1 when waiting on a lock and clearing it to zero when holding a lock. However, there are a few problems with this approach: - There is a write-write race between a CPU successfully taking the lock (and subsequently writing break_lock = 0) and a waiter waiting on the lock (and subsequently writing break_lock = 1). This could result in a contended lock being reported as uncontended and vice-versa. - On machines with store buffers, nothing guarantees that the writes to break_lock are visible to other CPUs at any particular time. - READ_ONCE/WRITE_ONCE are not used, so the field is potentially susceptible to harmful compiler optimisations, Consequently, the usefulness of this field is unclear and we'd be better off removing it and allowing architectures to implement raw_spin_is_contended() by providing a definition of arch_spin_is_contended(), as they can when CONFIG_GENERIC_LOCKBREAK=n. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Sebastian Ott Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1511894539-7988-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/spinlock.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 0ebb253e2199..936f3d14dd6b 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ break; \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - \ arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ } \ \ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ @@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ local_irq_restore(flags); \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - \ arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ + \ return flags; \ } \ \ -- cgit v1.2.3 From e966eaeeb623f09975ef362c2866fae6f86844f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 12 Dec 2017 12:31:16 +0100 Subject: locking/lockdep: Remove the cross-release locking checks This code (CONFIG_LOCKDEP_CROSSRELEASE=y and CONFIG_LOCKDEP_COMPLETIONS=y), while it found a number of old bugs initially, was also causing too many false positives that caused people to disable lockdep - which is arguably a worse overall outcome. If we disable cross-release by default but keep the code upstream then in practice the most likely outcome is that we'll allow the situation to degrade gradually, by allowing entropy to introduce more and more false positives, until it overwhelms maintenance capacity. Another bad side effect was that people were trying to work around the false positives by uglifying/complicating unrelated code. There's a marked difference between annotating locking operations and uglifying good code just due to bad lock debugging code ... This gradual decrease in quality happened to a number of debugging facilities in the kernel, and lockdep is pretty complex already, so we cannot risk this outcome. Either cross-release checking can be done right with no false positives, or it should not be included in the upstream kernel. ( Note that it might make sense to maintain it out of tree and go through the false positives every now and then and see whether new bugs were introduced. ) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 652 +++-------------------------------------------- 1 file changed, 35 insertions(+), 617 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 670d8d7d8087..5fa1324a4f29 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,6 @@ #define CREATE_TRACE_POINTS #include -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#include -#endif - #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -75,19 +71,6 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif -#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK -static int crossrelease_fullstack = 1; -#else -static int crossrelease_fullstack; -#endif -static int __init allow_crossrelease_fullstack(char *str) -{ - crossrelease_fullstack = 1; - return 0; -} - -early_param("crossrelease_fullstack", allow_crossrelease_fullstack); - /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -740,18 +723,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -static void cross_init(struct lockdep_map *lock, int cross); -static int cross_lock(struct lockdep_map *lock); -static int lock_acquire_crosslock(struct held_lock *hlock); -static int lock_release_crosslock(struct lockdep_map *lock); -#else -static inline void cross_init(struct lockdep_map *lock, int cross) {} -static inline int cross_lock(struct lockdep_map *lock) { return 0; } -static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } -static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } -#endif - /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1151,41 +1122,22 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - if (cross_lock(tgt->instance)) { - printk(" Possible unsafe locking scenario by crosslock:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk(" unlock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } else { - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); } /* @@ -1211,10 +1163,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, curr->comm, task_pid_nr(curr)); print_lock(check_src); - if (cross_lock(check_tgt->instance)) - pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); - else - pr_warn("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); @@ -1244,9 +1193,7 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (cross_lock(check_tgt->instance)) - this->trace = *trace; - else if (!save_trace(&this->trace)) + if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1850,9 +1797,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - if (cross_lock(prev->instance)) - continue; - return print_deadlock_bug(curr, prev, next); } return 1; @@ -2018,31 +1962,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; + /* - * Only non-crosslock entries get new dependencies added. - * Crosslock entries will be added by commit later: + * Only non-recursive-read entries get new dependencies + * added: */ - if (!cross_lock(hlock->instance)) { + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + if (!ret) + return 0; + /* - * Only non-recursive-read entries get new dependencies - * added: + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, - distance, &trace, save_trace); - if (!ret) - return 0; - - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } + if (!hlock->trylock) + break; } + depth--; /* * End of lock-stack? @@ -3292,21 +3231,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - cross_init(lock, 0); __lockdep_init_map(lock, name, key, subclass); } EXPORT_SYMBOL_GPL(lockdep_init_map); -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - cross_init(lock, 1); - __lockdep_init_map(lock, name, key, subclass); -} -EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); -#endif - struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3362,7 +3290,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; - int ret; if (unlikely(!debug_locks)) return 0; @@ -3411,8 +3338,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - /* TODO: nest_lock is not implemented for crosslock yet. */ - if (depth && !cross_lock(lock)) { + if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3500,14 +3426,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; - ret = lock_acquire_crosslock(hlock); - /* - * 2 means normal acquire operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3745,19 +3663,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int ret, i; + int i; if (unlikely(!debug_locks)) return 0; - ret = lock_release_crosslock(lock); - /* - * 2 means normal release operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -4675,495 +4585,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); - -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - -/* - * Crossrelease works by recording a lock history for each thread and - * connecting those historic locks that were taken after the - * wait_for_completion() in the complete() context. - * - * Task-A Task-B - * - * mutex_lock(&A); - * mutex_unlock(&A); - * - * wait_for_completion(&C); - * lock_acquire_crosslock(); - * atomic_inc_return(&cross_gen_id); - * | - * | mutex_lock(&B); - * | mutex_unlock(&B); - * | - * | complete(&C); - * `-- lock_commit_crosslock(); - * - * Which will then add a dependency between B and C. - */ - -#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) - -/* - * Whenever a crosslock is held, cross_gen_id will be increased. - */ -static atomic_t cross_gen_id; /* Can be wrapped */ - -/* - * Make an entry of the ring buffer invalid. - */ -static inline void invalidate_xhlock(struct hist_lock *xhlock) -{ - /* - * Normally, xhlock->hlock.instance must be !NULL. - */ - xhlock->hlock.instance = NULL; -} - -/* - * Lock history stacks; we have 2 nested lock history stacks: - * - * HARD(IRQ) - * SOFT(IRQ) - * - * The thing is that once we complete a HARD/SOFT IRQ the future task locks - * should not depend on any of the locks observed while running the IRQ. So - * what we do is rewind the history buffer and erase all our knowledge of that - * temporal event. - */ - -void crossrelease_hist_start(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (!cur->xhlocks) - return; - - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; -} - -void crossrelease_hist_end(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (cur->xhlocks) { - unsigned int idx = cur->xhlock_idx_hist[c]; - struct hist_lock *h = &xhlock(idx); - - cur->xhlock_idx = idx; - - /* Check if the ring was overwritten. */ - if (h->hist_id != cur->hist_id_save[c]) - invalidate_xhlock(h); - } -} - -/* - * lockdep_invariant_state() is used to annotate independence inside a task, to - * make one task look like multiple independent 'tasks'. - * - * Take for instance workqueues; each work is independent of the last. The - * completion of a future work does not depend on the completion of a past work - * (in general). Therefore we must not carry that (lock) dependency across - * works. - * - * This is true for many things; pretty much all kthreads fall into this - * pattern, where they have an invariant state and future completions do not - * depend on past completions. Its just that since they all have the 'same' - * form -- the kthread does the same over and over -- it doesn't typically - * matter. - * - * The same is true for system-calls, once a system call is completed (we've - * returned to userspace) the next system call does not depend on the lock - * history of the previous system call. - * - * They key property for independence, this invariant state, is that it must be - * a point where we hold no locks and have no history. Because if we were to - * hold locks, the restore at _end() would not necessarily recover it's history - * entry. Similarly, independence per-definition means it does not depend on - * prior state. - */ -void lockdep_invariant_state(bool force) -{ - /* - * We call this at an invariant point, no current state, no history. - * Verify the former, enforce the latter. - */ - WARN_ON_ONCE(!force && current->lockdep_depth); - if (current->xhlocks) - invalidate_xhlock(&xhlock(current->xhlock_idx)); -} - -static int cross_lock(struct lockdep_map *lock) -{ - return lock ? lock->cross : 0; -} - -/* - * This is needed to decide the relationship between wrapable variables. - */ -static inline int before(unsigned int a, unsigned int b) -{ - return (int)(a - b) < 0; -} - -static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) -{ - return hlock_class(&xhlock->hlock); -} - -static inline struct lock_class *xlock_class(struct cross_lock *xlock) -{ - return hlock_class(&xlock->hlock); -} - -/* - * Should we check a dependency with previous one? - */ -static inline int depend_before(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check && !hlock->trylock; -} - -/* - * Should we check a dependency with next one? - */ -static inline int depend_after(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check; -} - -/* - * Check if the xhlock is valid, which would be false if, - * - * 1. Has not used after initializaion yet. - * 2. Got invalidated. - * - * Remind hist_lock is implemented as a ring buffer. - */ -static inline int xhlock_valid(struct hist_lock *xhlock) -{ - /* - * xhlock->hlock.instance must be !NULL. - */ - return !!xhlock->hlock.instance; -} - -/* - * Record a hist_lock entry. - * - * Irq disable is only required. - */ -static void add_xhlock(struct held_lock *hlock) -{ - unsigned int idx = ++current->xhlock_idx; - struct hist_lock *xhlock = &xhlock(idx); - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * This can be done locklessly because they are all task-local - * state, we must however ensure IRQs are disabled. - */ - WARN_ON_ONCE(!irqs_disabled()); -#endif - - /* Initialize hist_lock's members */ - xhlock->hlock = *hlock; - xhlock->hist_id = ++current->hist_id; - - xhlock->trace.nr_entries = 0; - xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; - xhlock->trace.entries = xhlock->trace_entries; - - if (crossrelease_fullstack) { - xhlock->trace.skip = 3; - save_stack_trace(&xhlock->trace); - } else { - xhlock->trace.nr_entries = 1; - xhlock->trace.entries[0] = hlock->acquire_ip; - } -} - -static inline int same_context_xhlock(struct hist_lock *xhlock) -{ - return xhlock->hlock.irq_context == task_irq_context(current); -} - -/* - * This should be lockless as far as possible because this would be - * called very frequently. - */ -static void check_add_xhlock(struct held_lock *hlock) -{ - /* - * Record a hist_lock, only in case that acquisitions ahead - * could depend on the held_lock. For example, if the held_lock - * is trylock then acquisitions ahead never depends on that. - * In that case, we don't need to record it. Just return. - */ - if (!current->xhlocks || !depend_before(hlock)) - return; - - add_xhlock(hlock); -} - -/* - * For crosslock. - */ -static int add_xlock(struct held_lock *hlock) -{ - struct cross_lock *xlock; - unsigned int gen_id; - - if (!graph_lock()) - return 0; - - xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; - - /* - * When acquisitions for a crosslock are overlapped, we use - * nr_acquire to perform commit for them, based on cross_gen_id - * of the first acquisition, which allows to add additional - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. - * - * depend_after() is necessary to initialize only the first - * valid xlock so that the xlock can be used on its commit. - */ - if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) - goto unlock; - - gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); - xlock->hlock = *hlock; - xlock->hlock.gen_id = gen_id; -unlock: - graph_unlock(); - return 1; -} - -/* - * Called for both normal and crosslock acquires. Normal locks will be - * pushed on the hist_lock queue. Cross locks will record state and - * stop regular lock_acquire() to avoid being placed on the held_lock - * stack. - * - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_acquire_crosslock(struct held_lock *hlock) -{ - /* - * CONTEXT 1 CONTEXT 2 - * --------- --------- - * lock A (cross) - * X = atomic_inc_return(&cross_gen_id) - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * Y = atomic_read_acquire(&cross_gen_id) - * lock B - * - * atomic_read_acquire() is for ordering between A and B, - * IOW, A happens before B, when CONTEXT 2 see Y >= X. - * - * Pairs with atomic_inc_return() in add_xlock(). - */ - hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); - - if (cross_lock(hlock->instance)) - return add_xlock(hlock); - - check_add_xhlock(hlock); - return 2; -} - -static int copy_trace(struct stack_trace *trace) -{ - unsigned long *buf = stack_trace + nr_stack_trace_entries; - unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - unsigned int nr = min(max_nr, trace->nr_entries); - - trace->nr_entries = nr; - memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); - trace->entries = buf; - nr_stack_trace_entries += nr; - - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); - dump_stack(); - - return 0; - } - - return 1; -} - -static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) -{ - unsigned int xid, pid; - u64 chain_key; - - xid = xlock_class(xlock) - lock_classes; - chain_key = iterate_chain_key((u64)0, xid); - pid = xhlock_class(xhlock) - lock_classes; - chain_key = iterate_chain_key(chain_key, pid); - - if (lookup_chain_cache(chain_key)) - return 1; - - if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, - chain_key)) - return 0; - - if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, - &xhlock->trace, copy_trace)) - return 0; - - return 1; -} - -static void commit_xhlocks(struct cross_lock *xlock) -{ - unsigned int cur = current->xhlock_idx; - unsigned int prev_hist_id = xhlock(cur).hist_id; - unsigned int i; - - if (!graph_lock()) - return; - - if (xlock->nr_acquire) { - for (i = 0; i < MAX_XHLOCKS_NR; i++) { - struct hist_lock *xhlock = &xhlock(cur - i); - - if (!xhlock_valid(xhlock)) - break; - - if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) - break; - - if (!same_context_xhlock(xhlock)) - break; - - /* - * Filter out the cases where the ring buffer was - * overwritten and the current entry has a bigger - * hist_id than the previous one, which is impossible - * otherwise: - */ - if (unlikely(before(prev_hist_id, xhlock->hist_id))) - break; - - prev_hist_id = xhlock->hist_id; - - /* - * commit_xhlock() returns 0 with graph_lock already - * released if fail. - */ - if (!commit_xhlock(xlock, xhlock)) - return; - } - } - - graph_unlock(); -} - -void lock_commit_crosslock(struct lockdep_map *lock) -{ - struct cross_lock *xlock; - unsigned long flags; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (!current->xhlocks) - return; - - /* - * Do commit hist_locks with the cross_lock, only in case that - * the cross_lock could depend on acquisitions after that. - * - * For example, if the cross_lock does not have the 'check' flag - * then we don't need to check dependencies and commit for that. - * Just skip it. In that case, of course, the cross_lock does - * not depend on acquisitions ahead, either. - * - * WARNING: Don't do that in add_xlock() in advance. When an - * acquisition context is different from the commit context, - * invalid(skipped) cross_lock might be accessed. - */ - if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - xlock = &((struct lockdep_map_cross *)lock)->xlock; - commit_xhlocks(xlock); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_commit_crosslock); - -/* - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_release_crosslock(struct lockdep_map *lock) -{ - if (cross_lock(lock)) { - if (!graph_lock()) - return 0; - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; - graph_unlock(); - return 1; - } - return 2; -} - -static void cross_init(struct lockdep_map *lock, int cross) -{ - if (cross) - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; - - lock->cross = cross; - - /* - * Crossrelease assumes that the ring buffer size of xhlocks - * is aligned with power of 2. So force it on build. - */ - BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); -} - -void lockdep_init_task(struct task_struct *task) -{ - int i; - - task->xhlock_idx = UINT_MAX; - task->hist_id = 0; - - for (i = 0; i < XHLOCK_CTX_NR; i++) { - task->xhlock_idx_hist[i] = UINT_MAX; - task->hist_id_save[i] = 0; - } - - task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, - GFP_KERNEL); -} - -void lockdep_free_task(struct task_struct *task) -{ - if (task->xhlocks) { - void *tmp = task->xhlocks; - /* Diable crossrelease for current */ - task->xhlocks = NULL; - kfree(tmp); - } -} -#endif -- cgit v1.2.3 From e7fd37ba12170cc414be8b639dfc2c5f7172fac2 Mon Sep 17 00:00:00 2001 From: Ma Shimiao Date: Tue, 12 Dec 2017 09:43:49 +0800 Subject: cgroup: avoid copying strings longer than the buffers cgroup root name and file name have max length limit, we should avoid copying longer name than that to the name. tj: minor update to $SUBJ. Signed-off-by: Ma Shimiao Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0b1ffe147f24..18d71fbd3923 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else - strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strcpy(root->release_agent_path, opts->release_agent); + strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strcpy(root->name, opts->name); + strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -- cgit v1.2.3 From 283ca526a9bd75aed7350220d7b1f8027d99c3fd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Dec 2017 02:25:30 +0100 Subject: bpf: fix corruption on concurrent perf_event_output calls When tracing and networking programs are both attached in the system and both use event-output helpers that eventually call into perf_event_output(), then we could end up in a situation where the tracing attached program runs in user context while a cls_bpf program is triggered on that same CPU out of softirq context. Since both rely on the same per-cpu perf_sample_data, we could potentially corrupt it. This can only ever happen in a combination of the two types; all tracing programs use a bpf_prog_active counter to bail out in case a program is already running on that CPU out of a different context. XDP and cls_bpf programs by themselves don't have this issue as they run in the same context only. Therefore, split both perf_sample_data so they cannot be accessed from each other. Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Song Liu Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ce99c379c30..40207c2a4113 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_raw_record *raw) + u64 flags, struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; @@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_sample_data_init(sd, 0, 0); - sd->raw = raw; perf_event_output(event, sd, regs); return 0; } @@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); struct perf_raw_record raw = { .frag = { .size = size, @@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; - return __bpf_perf_event_output(regs, map, flags, &raw); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; + + return __bpf_perf_event_output(regs, map, flags, sd); } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); struct perf_raw_frag frag = { .copy = ctx_copy, @@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; perf_fetch_caller_regs(regs); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, &raw); + return __bpf_perf_event_output(regs, map, flags, sd); } BPF_CALL_0(bpf_get_current_task) -- cgit v1.2.3 From 9147efcbe0b7cc96b18eb64b1a3f0d4bba81443c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Dec 2017 14:22:39 -0800 Subject: bpf: add schedule points to map alloc/free While using large percpu maps, htab_map_alloc() can hold cpu for hundreds of ms. This patch adds cond_resched() calls to percpu alloc/free call sites, all running in process context. Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e469e05c8e83..3905d4bc5b80 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab) pptr = htab_elem_get_ptr(get_htab_elem(htab, i), htab->map.key_size); free_percpu(pptr); + cond_resched(); } free_elems: bpf_map_area_free(htab->elems); @@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, pptr); + cond_resched(); } skip_percpu_elems: -- cgit v1.2.3 From 689d77f001cd22da31cc943170e1f6f2e8197035 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 14 Dec 2017 15:33:02 -0800 Subject: kcov: fix comparison callback signature Fix a silly copy-paste bug. We truncated u32 args to u16. Link: http://lkml.kernel.org/r/20171207101134.107168-1-dvyukov@google.com Fixes: ded97d2c2b2c ("kcov: support comparison operands collection") Signed-off-by: Dmitry Vyukov Cc: syzkaller@googlegroups.com Cc: Alexander Potapenko Cc: Vegard Nossum Cc: Quentin Casasnovas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 15f33faf4013..7594c033d98a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); -void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); } @@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); -void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, _RET_IP_); -- cgit v1.2.3 From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 5 +++-- kernel/uid16.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); -- cgit v1.2.3 From 7c2c11b208be09c156573fc0076b7b3646e05219 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 14 Dec 2017 15:33:19 -0800 Subject: arch: define weak abort() gcc toggle -fisolate-erroneous-paths-dereference (default at -O2 onwards) isolates faulty code paths such as null pointer access, divide by zero etc. If gcc port doesnt implement __builtin_trap, an abort() is generated which causes kernel link error. In this case, gcc is generating abort due to 'divide by zero' in lib/mpi/mpih-div.c. Currently 'frv' and 'arc' are failing. Previously other arch was also broken like m32r was fixed by commit d22e3d69ee1a ("m32r: fix build failure"). Let's define this weak function which is common for all arch and fix the problem permanently. We can even remove the arch specific 'abort' after this is done. Link: http://lkml.kernel.org/r/1513118956-8718-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Cc: Alexey Brodkin Cc: Vineet Gupta Cc: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 6b4298a41167..df0c91d5606c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1755,3 +1755,11 @@ Efault: return -EFAULT; } #endif + +__weak void abort(void) +{ + BUG(); + + /* if that doesn't kill us, halt */ + panic("Oops failed to kill thread"); +} -- cgit v1.2.3 From b00d607bb188e187c7b60074d2fa91a6f1985029 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 5 Dec 2017 04:41:51 -0500 Subject: tracing: Have stack trace not record if RCU is not watching The stack tracer records a stack dump whenever it sees a stack usage that is more than what it ever saw before. This can happen at any function that is being traced. If it happens when the CPU is going idle (or other strange locations), RCU may not be watching, and in this case, the recording of the stack trace will trigger a warning. There's been lots of efforts to make hacks to allow stack tracing to proceed even if RCU is not watching, but this only causes more issues to appear. Simply do not trace a stack if RCU is not watching. It probably isn't a bad stack anyway. Acked-by: "Paul E. McKenney" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 734accc02418..3c7bfc4bf5e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (__this_cpu_read(disable_stack_tracer) != 1) goto out; + /* If rcu is not watching, then save stack trace can fail */ + if (!rcu_is_watching()) + goto out; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); -- cgit v1.2.3 From cef31d9af908243421258f1df35a4a644604efbe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Dec 2017 10:32:03 +0100 Subject: posix-timer: Properly check sigevent->sigev_notify timer_create() specifies via sigevent->sigev_notify the signal delivery for the new timer. The valid modes are SIGEV_NONE, SIGEV_SIGNAL, SIGEV_THREAD and (SIGEV_SIGNAL | SIGEV_THREAD_ID). The sanity check in good_sigevent() is only checking the valid combination for the SIGEV_THREAD_ID bit, i.e. SIGEV_SIGNAL, but if SIGEV_THREAD_ID is not set it accepts any random value. This has no real effects on the posix timer and signal delivery code, but it affects show_timer() which handles the output of /proc/$PID/timers. That function uses a string array to pretty print sigev_notify. The access to that array has no bound checks, so random sigev_notify cause access beyond the array bounds. Add proper checks for the valid notify modes and remove the SIGEV_THREAD_ID masking from various code pathes as SIGEV_NONE can never be set in combination with SIGEV_THREAD_ID. Reported-by: Eric Biggers Reported-by: Dmitry Vyukov Reported-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: stable@vger.kernel.org --- kernel/time/posix-timers.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 13d6881f908b..ec999f32c840 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + switch (event->sigev_notify) { + case SIGEV_SIGNAL | SIGEV_THREAD_ID: + rtn = find_task_by_vpid(event->sigev_notify_thread_id); + if (!rtn || !same_thread_group(rtn, current)) + return NULL; + /* FALLTHRU */ + case SIGEV_SIGNAL: + case SIGEV_THREAD: + if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) + return NULL; + /* FALLTHRU */ + case SIGEV_NONE: + return task_pid(rtn); + default: return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return task_pid(rtn); + } } static struct k_itimer * alloc_posix_timer(void) @@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sig_none = timr->it_sigev_notify == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ @@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); - sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); timr->it_active = !sigev_none; -- cgit v1.2.3 From 50034ed49645463a16327cad05694e201e6b4126 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Dec 2017 05:09:47 -0800 Subject: cgroup: use strlcpy() instead of strscpy() to avoid spurious warning As long as cft->name is guaranteed to be NUL-terminated, using strlcpy() would work just as well and avoid that warning, so the change below could be folded into that commit. Signed-off-by: Arnd Bergmann Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 18d71fbd3923..f4c2f8cb5748 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); + strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); + strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -- cgit v1.2.3 From f73c52a5bcd1710994e53fbccc378c42b97a06b6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 2 Dec 2017 13:04:54 -0500 Subject: sched/rt: Do not pull from current CPU if only one CPU to pull Daniel Wagner reported a crash on the BeagleBone Black SoC. This is a single CPU architecture, and does not have a functional arch_send_call_function_single_ipi() implementation which can crash the kernel if that is called. As it only has one CPU, it shouldn't be called, but if the kernel is compiled for SMP, the push/pull RT scheduling logic now calls it for irq_work if the one CPU is overloaded, it can use that function to call itself and crash the kernel. Ideally, we should disable the SCHED_FEAT(RT_PUSH_IPI) if the system only has a single CPU. But SCHED_FEAT is a constant if sched debugging is turned off. Another fix can also be used, and this should also help with normal SMP machines. That is, do not initiate the pull code if there's only one RT overloaded CPU, and that CPU happens to be the current CPU that is scheduling in a lower priority task. Even on a system with many CPUs, if there's many RT tasks waiting to run on a single CPU, and that CPU schedules in another RT task of lower priority, it will initiate the PULL logic in case there's a higher priority RT task on another CPU that is waiting to run. But if there is no other CPU with waiting RT tasks, it will initiate the RT pull logic on itself (as it still has RT tasks waiting to run). This is a wasted effort. Not only does this help with SMP code where the current CPU is the only one with RT overloaded tasks, it should also solve the issue that Daniel encountered, because it will prevent the PULL logic from executing, as there's only one CPU on the system, and the check added here will cause it to exit the RT pull code. Reported-by: Daniel Wagner Signed-off-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: linux-rt-users Cc: stable@vger.kernel.org Fixes: 4bdced5c9 ("sched/rt: Simplify the IPI based RT balancing logic") Link: http://lkml.kernel.org/r/20171202130454.4cbbfe8d@vmware.local.home Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4056c19ca3f0..665ace2fc558 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq) bool resched = false; struct task_struct *p; struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); - if (likely(!rt_overloaded(this_rq))) + if (likely(!rt_overload_count)) return; /* @@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq) */ smp_rmb(); + /* If we are the only overloaded CPU do nothing */ + if (rt_overload_count == 1 && + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) + return; + #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); -- cgit v1.2.3 From 04514d13222f2c4c91adf0ecb21004cec3388795 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Dec 2017 21:07:25 +0100 Subject: bpf: guarantee r1 to be ctx in case of bpf_helper_changes_pkt_data Some JITs don't cache skb context on stack in prologue, so when LD_ABS/IND is used and helper calls yield bpf_helper_changes_pkt_data() as true, then they temporarily save/restore skb pointer. However, the assumption that skb always has to be in r1 is a bit of a gamble. Right now it turned out to be true for all helpers listed in bpf_helper_changes_pkt_data(), but lets enforce that from verifier side, so that we make this a guarantee and bail out if the func proto is misconfigured in future helpers. In case of BPF helper calls from cBPF, bpf_helper_changes_pkt_data() is completely unrelevant here (since cBPF is context read-only) and therefore always false. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4593571c404..e39b01317b6f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1674,7 +1674,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } + /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(fn->func); + if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { + verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", + func_id_name(func_id), func_id); + return -EINVAL; + } memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; -- cgit v1.2.3 From 3382290ed2d5e275429cef510ab21889d3ccd164 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:48 +0100 Subject: locking/barriers: Convert users of lockless_dereference() to READ_ONCE() [ Note, this is a Git cherry-pick of the following commit: 506458efaf15 ("locking/barriers: Convert users of lockless_dereference() to READ_ONCE()") ... for easier x86 PTI code testing and back-porting. ] READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it can be used instead of lockless_dereference() without any change in semantics. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- kernel/seccomp.c | 2 +- kernel/task_work.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 10cdb9c26b5d..6eee4ed97af0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4233,7 +4233,7 @@ static void perf_remove_from_owner(struct perf_event *event) * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - owner = lockless_dereference(event->owner); + owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -4330,7 +4330,7 @@ again: * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ - ctx = lockless_dereference(child->ctx); + ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 418a1c045933..5f0dfb2abb8d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -190,7 +190,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = - lockless_dereference(current->seccomp.filter); + READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) diff --git a/kernel/task_work.c b/kernel/task_work.c index 5718b3ea202a..0fef395662a6 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -68,7 +68,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = lockless_dereference(*pprev))) { + while ((work = READ_ONCE(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) -- cgit v1.2.3 From bf29cb238dc0656e6564b6a94bb82e11d2129437 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Dec 2017 19:18:25 +0100 Subject: sched/isolation: Make CONFIG_NO_HZ_FULL select CONFIG_CPU_ISOLATION CONFIG_NO_HZ_FULL doesn't make sense without CONFIG_CPU_ISOLATION. In fact enabling the first without the second is a regression as nohz_full= boot parameter gets silently ignored. Besides this unnatural combination hangs RCU gp kthread when running rcutorture for reasons that are not yet fully understood: rcu_preempt kthread starved for 9974 jiffies! g4294967208 +c4294967207 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x402 ->cpu=0 rcu_preempt I 7464 8 2 0x80000000 Call Trace: __schedule+0x493/0x620 schedule+0x24/0x40 schedule_timeout+0x330/0x3b0 ? preempt_count_sub+0xea/0x140 ? collect_expired_timers+0xb0/0xb0 rcu_gp_kthread+0x6bf/0xef0 This commit therefore makes NO_HZ_FULL select CPU_ISOLATION, which prevents all these bad behaviours. Reported-by: kernel test robot Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: John Stultz Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Fixes: 5c4991e24c69 ("sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL") Link: http://lkml.kernel.org/r/1513275507-29200-2-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/time/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e776fc8cc1df..f6b5f19223d6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -95,6 +95,7 @@ config NO_HZ_FULL select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN select IRQ_WORK + select CPU_ISOLATION help Adaptively try to shutdown the tick whenever possible, even when the CPU is running tasks. Typically this requires running a single -- cgit v1.2.3 From 116d2f7496c51b2e02e8e4ecdd2bdf5fb9d5a641 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Tue, 19 Dec 2017 12:56:57 +0530 Subject: cgroup: Fix deadlock in cpu hotplug path Deadlock during cgroup migration from cpu hotplug path when a task T is being moved from source to destination cgroup. kworker/0:0 cpuset_hotplug_workfn() cpuset_hotplug_update_tasks() hotplug_update_tasks_legacy() remove_tasks_in_empty_cpuset() cgroup_transfer_tasks() // stuck in iterator loop cgroup_migrate() cgroup_migrate_add_task() In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T. Task T will not migrate to destination cgroup. css_task_iter_start() will keep pointing to task T in loop waiting for task T cg_list node to be removed. Task T do_exit() exit_signals() // sets PF_EXITING exit_task_namespaces() switch_task_namespaces() free_nsproxy() put_mnt_ns() drop_collected_mounts() namespace_unlock() synchronize_rcu() _synchronize_rcu_expedited() schedule_work() // on cpu0 low priority worker pool wait_event() // waiting for work item to execute Task T inserted a work item in the worklist of cpu0 low priority worker pool. It is waiting for expedited grace period work item to execute. This work item will only be executed once kworker/0:0 complete execution of cpuset_hotplug_workfn(). kworker/0:0 ==> Task T ==>kworker/0:0 In case of PF_EXITING task being migrated from source to destination cgroup, migrate next available task in source cgroup. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 024085daab1a..a2c05d2476ac 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) */ do { css_task_iter_start(&from->self, 0, &it); - task = css_task_iter_next(&it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); -- cgit v1.2.3 From 74d0833c659a8a54735e5efdd44f4b225af68586 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 20 Dec 2017 07:09:19 -0800 Subject: cgroup: fix css_task_iter crash on CSS_TASK_ITER_PROC While teaching css_task_iter to handle skipping over tasks which aren't group leaders, bc2fb7ed089f ("cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS") introduced a silly bug. CSS_TASK_ITER_PROCS is implemented by repeating css_task_iter_advance() while the advanced cursor is pointing to a non-leader thread. However, the cursor variable, @l, wasn't updated when the iteration has to advance to the next css_set and the following repetition would operate on the terminal @l from the previous iteration which isn't pointing to a valid task leading to oopses like the following or infinite looping. BUG: unable to handle kernel NULL pointer dereference at 0000000000000254 IP: __task_pid_nr_ns+0xc7/0xf0 PGD 0 P4D 0 Oops: 0000 [#1] SMP ... CPU: 2 PID: 1 Comm: systemd Not tainted 4.14.4-200.fc26.x86_64 #1 Hardware name: System manufacturer System Product Name/PRIME B350M-A, BIOS 3203 11/09/2017 task: ffff88c4baee8000 task.stack: ffff96d5c3158000 RIP: 0010:__task_pid_nr_ns+0xc7/0xf0 RSP: 0018:ffff96d5c315bd50 EFLAGS: 00010206 RAX: 0000000000000000 RBX: ffff88c4b68c6000 RCX: 0000000000000250 RDX: ffffffffa5e47960 RSI: 0000000000000000 RDI: ffff88c490f6ab00 RBP: ffff96d5c315bd50 R08: 0000000000001000 R09: 0000000000000005 R10: ffff88c4be006b80 R11: ffff88c42f1b8004 R12: ffff96d5c315bf18 R13: ffff88c42d7dd200 R14: ffff88c490f6a510 R15: ffff88c4b68c6000 FS: 00007f9446f8ea00(0000) GS:ffff88c4be680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000254 CR3: 00000007f956f000 CR4: 00000000003406e0 Call Trace: cgroup_procs_show+0x19/0x30 cgroup_seqfile_show+0x4c/0xb0 kernfs_seq_show+0x21/0x30 seq_read+0x2ec/0x3f0 kernfs_fop_read+0x134/0x180 __vfs_read+0x37/0x160 ? security_file_permission+0x9b/0xc0 vfs_read+0x8e/0x130 SyS_read+0x55/0xc0 entry_SYSCALL_64_fastpath+0x1a/0xa5 RIP: 0033:0x7f94455f942d RSP: 002b:00007ffe81ba2d00 EFLAGS: 00000293 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 00005574e2233f00 RCX: 00007f94455f942d RDX: 0000000000001000 RSI: 00005574e2321a90 RDI: 000000000000002b RBP: 0000000000000000 R08: 00005574e2321a90 R09: 00005574e231de60 R10: 00007f94458c8b38 R11: 0000000000000293 R12: 00007f94458c8ae0 R13: 00007ffe81ba3800 R14: 0000000000000000 R15: 00005574e2116560 Code: 04 74 0e 89 f6 48 8d 04 76 48 8d 04 c5 f0 05 00 00 48 8b bf b8 05 00 00 48 01 c7 31 c0 48 8b 0f 48 85 c9 74 18 8b b2 30 08 00 00 <3b> 71 04 77 0d 48 c1 e6 05 48 01 f1 48 3b 51 38 74 09 5d c3 8b RIP: __task_pid_nr_ns+0xc7/0xf0 RSP: ffff96d5c315bd50 Fix it by moving the initialization of the cursor below the repeat label. While at it, rename it to @next for readability. Signed-off-by: Tejun Heo Fixes: bc2fb7ed089f ("cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS") Cc: stable@vger.kernel.org # v4.14+ Reported-by: Laura Abbott Reported-by: Bronek Kozicki Reported-by: George Amanakis Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f4c2f8cb5748..2cf06c274e4c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) static void css_task_iter_advance(struct css_task_iter *it) { - struct list_head *l = it->task_pos; + struct list_head *next; lockdep_assert_held(&css_set_lock); - WARN_ON_ONCE(!l); - repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. After ->mg_tasks, we move onto the * next cset. */ - l = l->next; + next = it->task_pos->next; - if (l == it->tasks_head) - l = it->mg_tasks_head->next; + if (next == it->tasks_head) + next = it->mg_tasks_head->next; - if (l == it->mg_tasks_head) + if (next == it->mg_tasks_head) css_task_iter_advance_css_set(it); else - it->task_pos = l; + it->task_pos = next; /* if PROCS, skip over tasks which aren't group leaders */ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && -- cgit v1.2.3 From 4374f256ce8182019353c0c639bb8d0695b4c941 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 18 Dec 2017 20:11:53 -0800 Subject: bpf/verifier: fix bounds calculation on BPF_RSH Incorrect signed bounds were being computed. If the old upper signed bound was positive and the old lower signed bound was negative, this could cause the new upper signed bound to be too low, leading to security issues. Fixes: b03c9f9fdc37 ("bpf/verifier: track signed and unsigned min/max values") Reported-by: Jann Horn Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov [jannh@google.com: changed description to reflect bug impact] Signed-off-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e39b01317b6f..625e358ca765 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2190,20 +2190,22 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->dst_reg); break; } - /* BPF_RSH is an unsigned shift, so make the appropriate casts */ - if (dst_reg->smin_value < 0) { - if (umin_val) { - /* Sign bit will be cleared */ - dst_reg->smin_value = 0; - } else { - /* Lost sign bit information */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } - } else { - dst_reg->smin_value = - (u64)(dst_reg->smin_value) >> umax_val; - } + /* BPF_RSH is an unsigned shift. If the value in dst_reg might + * be negative, then either: + * 1) src_reg might be zero, so the sign bit of the result is + * unknown, so we lose our signed bounds + * 2) it's known negative, thus the unsigned bounds capture the + * signed bounds + * 3) the signed bounds cross zero, so they tell us nothing + * about the result + * If the value in dst_reg is known nonnegative, then again the + * unsigned bounts capture the signed bounds. + * Thus, in all cases it suffices to blow away our signed bounds + * and rely on inferring new ones from the unsigned bounds and + * var_off of the result. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; if (src_known) dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); -- cgit v1.2.3 From 95a762e2c8c942780948091f8f2a4f32fce1ac6f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 18 Dec 2017 20:11:54 -0800 Subject: bpf: fix incorrect sign extension in check_alu_op() Distinguish between BPF_ALU64|BPF_MOV|BPF_K (load 32-bit immediate, sign-extended to 64-bit) and BPF_ALU|BPF_MOV|BPF_K (load 32-bit immediate, zero-padded to 64-bit); only perform sign extension in the first case. Starting with v4.14, this is exploitable by unprivileged users as long as the unprivileged_bpf_disabled sysctl isn't set. Debian assigned CVE-2017-16995 for this issue. v3: - add CVE number (Ben Hutchings) Fixes: 484611357c19 ("bpf: allow access into map value arrays") Signed-off-by: Jann Horn Acked-by: Edward Cree Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 625e358ca765..c086010ae51e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2408,7 +2408,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * remember the value we stored into this reg */ regs[insn->dst_reg].type = SCALAR_VALUE; - __mark_reg_known(regs + insn->dst_reg, insn->imm); + if (BPF_CLASS(insn->code) == BPF_ALU64) { + __mark_reg_known(regs + insn->dst_reg, + insn->imm); + } else { + __mark_reg_known(regs + insn->dst_reg, + (u32)insn->imm); + } } } else if (opcode > BPF_END) { -- cgit v1.2.3 From 0c17d1d2c61936401f4702e1846e2c19b200f958 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 18 Dec 2017 20:11:55 -0800 Subject: bpf: fix incorrect tracking of register size truncation Properly handle register truncation to a smaller size. The old code first mirrors the clearing of the high 32 bits in the bitwise tristate representation, which is correct. But then, it computes the new arithmetic bounds as the intersection between the old arithmetic bounds and the bounds resulting from the bitwise tristate representation. Therefore, when coerce_reg_to_32() is called on a number with bounds [0xffff'fff8, 0x1'0000'0007], the verifier computes [0xffff'fff8, 0xffff'ffff] as bounds of the truncated number. This is incorrect: The truncated number could also be in the range [0, 7], and no meaningful arithmetic bounds can be computed in that case apart from the obvious [0, 0xffff'ffff]. Starting with v4.14, this is exploitable by unprivileged users as long as the unprivileged_bpf_disabled sysctl isn't set. Debian assigned CVE-2017-16996 for this issue. v2: - flip the mask during arithmetic bounds calculation (Ben Hutchings) v3: - add CVE number (Ben Hutchings) Fixes: b03c9f9fdc37 ("bpf/verifier: track signed and unsigned min/max values") Signed-off-by: Jann Horn Acked-by: Edward Cree Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c086010ae51e..f716bdf29dd0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1067,6 +1067,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +/* truncate register to smaller size (in bytes) + * must be called with size < BPF_REG_SIZE + */ +static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) +{ + u64 mask; + + /* clear high bits in bit representation */ + reg->var_off = tnum_cast(reg->var_off, size); + + /* fix arithmetic bounds */ + mask = ((u64)1 << (size * 8)) - 1; + if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { + reg->umin_value &= mask; + reg->umax_value &= mask; + } else { + reg->umin_value = 0; + reg->umax_value = mask; + } + reg->smin_value = reg->umin_value; + reg->smax_value = reg->umax_value; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -1200,9 +1223,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - regs[value_regno].var_off = - tnum_cast(regs[value_regno].var_off, size); - __update_reg_bounds(®s[value_regno]); + coerce_reg_to_size(®s[value_regno], size); } return err; } @@ -1772,14 +1793,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return 0; } -static void coerce_reg_to_32(struct bpf_reg_state *reg) -{ - /* clear high 32 bits */ - reg->var_off = tnum_cast(reg->var_off, 4); - /* Update bounds */ - __update_reg_bounds(reg); -} - static bool signed_add_overflows(s64 a, s64 b) { /* Do the add in u64, where overflow is well-defined */ @@ -2017,8 +2030,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops are (32,32)->64 */ - coerce_reg_to_32(dst_reg); - coerce_reg_to_32(&src_reg); + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); } smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; @@ -2398,10 +2411,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } mark_reg_unknown(env, regs, insn->dst_reg); - /* high 32 bits are known zero. */ - regs[insn->dst_reg].var_off = tnum_cast( - regs[insn->dst_reg].var_off, 4); - __update_reg_bounds(®s[insn->dst_reg]); + coerce_reg_to_size(®s[insn->dst_reg], 4); } } else { /* case: R = imm -- cgit v1.2.3 From 468f6eafa6c44cb2c5d8aad35e12f06c240a812a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 18 Dec 2017 20:11:56 -0800 Subject: bpf: fix 32-bit ALU op verification 32-bit ALU ops operate on 32-bit values and have 32-bit outputs. Adjust the verifier accordingly. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f716bdf29dd0..ecdc265244ca 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2017,6 +2017,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return 0; } +/* WARNING: This function does calculations on 64-bit values, but the actual + * execution may occur on 32-bit values. Therefore, things like bitshifts + * need extra checks in the 32-bit case. + */ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *dst_reg, @@ -2027,12 +2031,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, bool src_known, dst_known; s64 smin_val, smax_val; u64 umin_val, umax_val; + u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; - if (BPF_CLASS(insn->code) != BPF_ALU64) { - /* 32-bit ALU ops are (32,32)->64 */ - coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); - } smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -2168,9 +2168,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; case BPF_LSH: - if (umax_val > 63) { - /* Shifts greater than 63 are undefined. This includes - * shifts by a negative number. + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. */ mark_reg_unknown(env, regs, insn->dst_reg); break; @@ -2196,9 +2196,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; case BPF_RSH: - if (umax_val > 63) { - /* Shifts greater than 63 are undefined. This includes - * shifts by a negative number. + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. */ mark_reg_unknown(env, regs, insn->dst_reg); break; @@ -2234,6 +2234,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; } + if (BPF_CLASS(insn->code) != BPF_ALU64) { + /* 32-bit ALU ops are (32,32)->32 */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); return 0; -- cgit v1.2.3 From ea25f914dc164c8d56b36147ecc86bc65f83c469 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 18 Dec 2017 20:11:57 -0800 Subject: bpf: fix missing error return in check_stack_boundary() Prevent indirect stack accesses at non-constant addresses, which would permit reading and corrupting spilled pointers. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ecdc265244ca..77e4b5223867 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1303,6 +1303,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); + return -EACCES; } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || -- cgit v1.2.3 From a5ec6ae161d72f01411169a938fa5f8baea16e8f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 18 Dec 2017 20:11:58 -0800 Subject: bpf: force strict alignment checks for stack pointers Force strict alignment checks for stack pointers because the tracking of stack spills relies on it; unaligned stack accesses can lead to corruption of spilled registers, which is exploitable. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 77e4b5223867..102c519836f6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1059,6 +1059,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_STACK: pointer_desc = "stack "; + /* The stack spill tracking logic in check_stack_write() + * and check_stack_read() relies on stack accesses being + * aligned. + */ + strict = true; break; default: break; -- cgit v1.2.3 From 179d1c5602997fef5a940c6ddcf31212cbfebd14 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 18 Dec 2017 20:11:59 -0800 Subject: bpf: don't prune branches when a scalar is replaced with a pointer This could be made safe by passing through a reference to env and checking for env->allow_ptr_leaks, but it would only work one way and is probably not worth the hassle - not doing it will not directly lead to program rejection. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 102c519836f6..982bd9ec721a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3467,15 +3467,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); } else { - /* if we knew anything about the old value, we're not - * equal, because we can't know anything about the - * scalar value of the pointer in the new value. + /* We're trying to use a pointer in place of a scalar. + * Even if the scalar was unbounded, this could lead to + * pointer leaks because scalars are allowed to leak + * while pointers are not. We could make this safe in + * special cases if root is calling us, but it's + * probably not worth the hassle. */ - return rold->umin_value == 0 && - rold->umax_value == U64_MAX && - rold->smin_value == S64_MIN && - rold->smax_value == S64_MAX && - tnum_is_unknown(rold->var_off); + return false; } case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and -- cgit v1.2.3 From bb7f0f989ca7de1153bd128a40a71709e339fa03 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 18 Dec 2017 20:12:00 -0800 Subject: bpf: fix integer overflows There were various issues related to the limited size of integers used in the verifier: - `off + size` overflow in __check_map_access() - `off + reg->off` overflow in check_mem_access() - `off + reg->var_off.value` overflow or 32-bit truncation of `reg->var_off.value` in check_mem_access() - 32-bit truncation in check_stack_boundary() Make sure that any integer math cannot overflow by not allowing pointer math with large values. Also reduce the scope of "scalar op scalar" tracking. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 982bd9ec721a..86dfe6b5c243 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1819,6 +1819,41 @@ static bool signed_sub_overflows(s64 a, s64 b) return res > a; } +static bool check_reg_sane_offset(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + enum bpf_reg_type type) +{ + bool known = tnum_is_const(reg->var_off); + s64 val = reg->var_off.value; + s64 smin = reg->smin_value; + + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { + verbose(env, "math between %s pointer and %lld is not allowed\n", + reg_type_str[type], val); + return false; + } + + if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { + verbose(env, "%s pointer offset %d is not allowed\n", + reg_type_str[type], reg->off); + return false; + } + + if (smin == S64_MIN) { + verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", + reg_type_str[type]); + return false; + } + + if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { + verbose(env, "value %lld makes %s pointer be out of bounds\n", + smin, reg_type_str[type]); + return false; + } + + return true; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -1887,6 +1922,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->type = ptr_reg->type; dst_reg->id = ptr_reg->id; + if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || + !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) + return -EINVAL; + switch (opcode) { case BPF_ADD: /* We can take a fixed offset as long as it doesn't overflow @@ -2017,6 +2056,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } + if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) + return -EINVAL; + __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); @@ -2046,6 +2088,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); + if (!src_known && + opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { + __mark_reg_unknown(dst_reg); + return 0; + } + switch (opcode) { case BPF_ADD: if (signed_add_overflows(dst_reg->smin_value, smin_val) || -- cgit v1.2.3 From 82abbf8d2fc46d79611ab58daa7c608df14bb3ee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 18 Dec 2017 20:15:20 -0800 Subject: bpf: do not allow root to mangle valid pointers Do not allow root to convert valid pointers into unknown scalars. In particular disallow: ptr &= reg ptr <<= reg ptr += ptr and explicitly allow: ptr -= ptr since pkt_end - pkt == length 1. This minimizes amount of address leaks root can do. In the future may need to further tighten the leaks with kptr_restrict. 2. If program has such pointer math it's likely a user mistake and when verifier complains about it right away instead of many instructions later on invalid memory access it's easier for users to fix their progs. 3. when register holding a pointer cannot change to scalar it allows JITs to optimize better. Like 32-bit archs could use single register for pointers instead of a pair required to hold 64-bit scalars. 4. reduces architecture dependent behavior. Since code: r1 = r10; r1 &= 0xff; if (r1 ...) will behave differently arm64 vs x64 and offloaded vs native. A significant chunk of ptr mangling was allowed by commit f1174f77b50c ("bpf/verifier: rework value tracking") yet some of it was allowed even earlier. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 102 +++++++++++++++++--------------------------------- 1 file changed, 34 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 86dfe6b5c243..04b24876cd23 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1890,29 +1890,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ - if (!env->allow_ptr_leaks) - verbose(env, - "R%d 32-bit pointer arithmetic prohibited\n", - dst); + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", + dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", - dst); + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", - dst); + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", - dst); + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + dst); return -EACCES; } @@ -1979,9 +1975,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_SUB: if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d tried to subtract pointer from scalar\n", - dst); + verbose(env, "R%d tried to subtract pointer from scalar\n", + dst); return -EACCES; } /* We don't allow subtraction from FP, because (according to @@ -1989,9 +1984,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * be able to deal with it. */ if (ptr_reg->type == PTR_TO_STACK) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d subtraction from stack pointer prohibited\n", - dst); + verbose(env, "R%d subtraction from stack pointer prohibited\n", + dst); return -EACCES; } if (known && (ptr_reg->off - smin_val == @@ -2040,19 +2034,14 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_AND: case BPF_OR: case BPF_XOR: - /* bitwise ops on pointers are troublesome, prohibit for now. - * (However, in principle we could allow some cases, e.g. - * ptr &= ~3 which would reduce min_value by 3.) - */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d bitwise operator %s on pointer prohibited\n", - dst, bpf_alu_string[opcode >> 4]); + /* bitwise ops on pointers are troublesome, prohibit. */ + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", + dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", - dst, bpf_alu_string[opcode >> 4]); + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", + dst, bpf_alu_string[opcode >> 4]); return -EACCES; } @@ -2308,7 +2297,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); - int rc; dst_reg = ®s[insn->dst_reg]; src_reg = NULL; @@ -2319,43 +2307,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, if (src_reg->type != SCALAR_VALUE) { if (dst_reg->type != SCALAR_VALUE) { /* Combining two pointers by any ALU op yields - * an arbitrary scalar. + * an arbitrary scalar. Disallow all math except + * pointer subtraction */ - if (!env->allow_ptr_leaks) { - verbose(env, "R%d pointer %s pointer prohibited\n", - insn->dst_reg, - bpf_alu_string[opcode >> 4]); - return -EACCES; + if (opcode == BPF_SUB){ + mark_reg_unknown(env, regs, insn->dst_reg); + return 0; } - mark_reg_unknown(env, regs, insn->dst_reg); - return 0; + verbose(env, "R%d pointer %s pointer prohibited\n", + insn->dst_reg, + bpf_alu_string[opcode >> 4]); + return -EACCES; } else { /* scalar += pointer * This is legal, but we have to reverse our * src/dest handling in computing the range */ - rc = adjust_ptr_min_max_vals(env, insn, - src_reg, dst_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* scalar += unknown scalar */ - __mark_reg_unknown(&off_reg); - return adjust_scalar_min_max_vals( - env, insn, - dst_reg, off_reg); - } - return rc; + return adjust_ptr_min_max_vals(env, insn, + src_reg, dst_reg); } } else if (ptr_reg) { /* pointer += scalar */ - rc = adjust_ptr_min_max_vals(env, insn, - dst_reg, src_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* unknown scalar += scalar */ - __mark_reg_unknown(dst_reg); - return adjust_scalar_min_max_vals( - env, insn, dst_reg, *src_reg); - } - return rc; + return adjust_ptr_min_max_vals(env, insn, + dst_reg, src_reg); } } else { /* Pretend the src is a reg with a known value, since we only @@ -2364,17 +2338,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, off_reg.type = SCALAR_VALUE; __mark_reg_known(&off_reg, insn->imm); src_reg = &off_reg; - if (ptr_reg) { /* pointer += K */ - rc = adjust_ptr_min_max_vals(env, insn, - ptr_reg, src_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* unknown scalar += K */ - __mark_reg_unknown(dst_reg); - return adjust_scalar_min_max_vals( - env, insn, dst_reg, off_reg); - } - return rc; - } + if (ptr_reg) /* pointer += K */ + return adjust_ptr_min_max_vals(env, insn, + ptr_reg, src_reg); } /* Got here implies adding two SCALAR_VALUEs */ -- cgit v1.2.3 From c10e83f598d08046dd1ebc8360d4bb12d802d51b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Dec 2017 12:27:29 +0100 Subject: arch, mm: Allow arch_dup_mmap() to fail In order to sanitize the LDT initialization on x86 arch_dup_mmap() must be allowed to fail. Fix up all instances. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Andy Lutomirsky Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: dan.j.williams@intel.com Cc: hughd@google.com Cc: keescook@google.com Cc: kirill.shutemov@linux.intel.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 07cc743698d3..500ce64517d9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; } /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; + retval = arch_dup_mmap(oldmm, mm); out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); -- cgit v1.2.3 From c0ee554906c3d6554fbddf95ae664cd9f817082b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Dec 2017 12:37:43 -0600 Subject: pid: Handle failure to allocate the first pid in a pid namespace With the replacement of the pid bitmap and hashtable with an idr in alloc_pid started occassionally failing when allocating the first pid in a pid namespace. Things were not completely reset resulting in the first allocated pid getting the number 2 (not 1). Which further resulted in ns->proc_mnt not getting set and eventually causing an oops in proc_flush_task. Oops: 0000 [#1] SMP CPU: 2 PID: 6743 Comm: trinity-c117 Not tainted 4.15.0-rc4-think+ #2 RIP: 0010:proc_flush_task+0x8e/0x1b0 RSP: 0018:ffffc9000bbffc40 EFLAGS: 00010286 RAX: 0000000000000001 RBX: 0000000000000001 RCX: 00000000fffffffb RDX: 0000000000000000 RSI: ffffc9000bbffc50 RDI: 0000000000000000 RBP: ffffc9000bbffc63 R08: 0000000000000000 R09: 0000000000000002 R10: ffffc9000bbffb70 R11: ffffc9000bbffc64 R12: 0000000000000003 R13: 0000000000000000 R14: 0000000000000003 R15: ffff8804c10d7840 FS: 00007f7cb8965700(0000) GS:ffff88050a200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000003e21ae003 CR4: 00000000001606e0 DR0: 00007fb1d6c22000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600 Call Trace: ? release_task+0xaf/0x680 release_task+0xd2/0x680 ? wait_consider_task+0xb82/0xce0 wait_consider_task+0xbe9/0xce0 ? do_wait+0xe1/0x330 do_wait+0x151/0x330 kernel_wait4+0x8d/0x150 ? task_stopped_code+0x50/0x50 SYSC_wait4+0x95/0xa0 ? rcu_read_lock_sched_held+0x6c/0x80 ? syscall_trace_enter+0x2d7/0x340 ? do_syscall_64+0x60/0x210 do_syscall_64+0x60/0x210 entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7f7cb82603aa RSP: 002b:00007ffd60770bc8 EFLAGS: 00000246 ORIG_RAX: 000000000000003d RAX: ffffffffffffffda RBX: 00007f7cb6cd4000 RCX: 00007f7cb82603aa RDX: 000000000000000b RSI: 00007ffd60770bd0 RDI: 0000000000007cca RBP: 0000000000007cca R08: 00007f7cb8965700 R09: 00007ffd607c7080 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffd60770bd0 R14: 00007f7cb6cd4058 R15: 00000000cccccccd Code: c1 e2 04 44 8b 60 30 48 8b 40 38 44 8b 34 11 48 c7 c2 60 3a f5 81 44 89 e1 4c 8b 68 58 e8 4b b4 77 00 89 44 24 14 48 8d 74 24 10 <49> 8b 7d 00 e8 b9 6a f9 ff 48 85 c0 74 1a 48 89 c7 48 89 44 24 RIP: proc_flush_task+0x8e/0x1b0 RSP: ffffc9000bbffc40 CR2: 0000000000000000 ---[ end trace 53d67a6481059862 ]--- Improve the quality of the implementation by resetting the place to start allocating pids on failure to allocate the first pid. As improving the quality of the implementation is the goal remove the now unnecesarry disable_pid_allocations call when we fail to mount proc. Fixes: 95846ecf9dac ("pid: replace pid bitmap implementation with IDR API") Fixes: 8ef047aaaeb8 ("pid namespaces: make alloc_pid(), free_pid() and put_pid() work with struct upid") Reported-by: Dave Jones Signed-off-by: "Eric W. Biederman" --- kernel/pid.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index b13b624e2c49..1e8bb6550ec4 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -193,10 +193,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) } if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) { - disable_pid_allocation(ns); + if (pid_ns_prepare_proc(ns)) goto out_free; - } } get_pid_ns(ns); @@ -226,6 +224,10 @@ out_free: while (++i <= ns->level) idr_remove(&ns->idr, (pid->numbers + i)->nr); + /* On failure to allocate the first pid, reset the state */ + if (ns->pid_allocated == PIDNS_ADDING) + idr_set_cursor(&ns->idr, 0); + spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); -- cgit v1.2.3 From 76dc6c097d581ad8eeedf8e1a000423a3d742445 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 26 Dec 2017 15:08:53 +0100 Subject: cpu/hotplug: Move inline keyword at the beginning of declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix non-fatal warnings such as: kernel/cpu.c:95:1: warning: ‘inline’ is not at beginning of declaration [-Wold-style-declaration] static void inline cpuhp_lock_release(bool bringup) { } ^~~~~~ Signed-off-by: Mathieu Malaterre Signed-off-by: Thomas Gleixner Cc: Arnd Bergmann Cc: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: "Paul E. McKenney" Link: https://lkml.kernel.org/r/20171226140855.16583-1-malat@debian.org --- kernel/cpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 41376c3ac93b..3d002a6f216e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); -static void inline cpuhp_lock_acquire(bool bringup) +static inline void cpuhp_lock_acquire(bool bringup) { lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } -static void inline cpuhp_lock_release(bool bringup) +static inline void cpuhp_lock_release(bool bringup) { lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } #else -static void inline cpuhp_lock_acquire(bool bringup) { } -static void inline cpuhp_lock_release(bool bringup) { } +static inline void cpuhp_lock_acquire(bool bringup) { } +static inline void cpuhp_lock_release(bool bringup) { } #endif -- cgit v1.2.3 From 45d8b80c2ac5d21cd1e2954431fb676bc2b1e099 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Dec 2017 20:32:35 -0500 Subject: ring-buffer: Mask out the info bits when returning buffer page length Two info bits were added to the "commit" part of the ring buffer data page when returned to be consumed. This was to inform the user space readers that events have been missed, and that the count may be stored at the end of the page. What wasn't handled, was the splice code that actually called a function to return the length of the data in order to zero out the rest of the page before sending it up to user space. These data bits were returned with the length making the value negative, and that negative value was not checked. It was compared to PAGE_SIZE, and only used if the size was less than PAGE_SIZE. Luckily PAGE_SIZE is unsigned long which made the compare an unsigned compare, meaning the negative size value did not end up causing a large portion of memory to be randomly zeroed out. Cc: stable@vger.kernel.org Fixes: 66a8cb95ed040 ("ring-buffer: Add place holder recording of dropped events") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c87766c1c204..e06cde093f76 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage) */ size_t ring_buffer_page_len(void *page) { - return local_read(&((struct buffer_data_page *)page)->commit) + struct buffer_data_page *bpage = page; + + return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) + BUF_PAGE_HDR_SIZE; } -- cgit v1.2.3 From 6b7e633fe9c24682df550e5311f47fb524701586 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Dec 2017 20:38:57 -0500 Subject: tracing: Remove extra zeroing out of the ring buffer page The ring_buffer_read_page() takes care of zeroing out any extra data in the page that it returns. There's no need to zero it out again from the consumer. It was removed from one consumer of this function, but read_buffers_splice_read() did not remove it, and worse, it contained a nasty bug because of it. Cc: stable@vger.kernel.org Fixes: 2711ca237a084 ("ring-buffer: Move zeroing out excess in page to ring buffer code") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 59518b8126d0..73652d5318b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6769,7 +6769,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int entries, size, i; + int entries, i; ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE @@ -6823,14 +6823,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - /* - * zero out any left over data, this is going to - * user land. - */ - size = ring_buffer_page_len(ref->page); - if (size < PAGE_SIZE) - memset(ref->page + size, 0, PAGE_SIZE - size); - page = virt_to_page(ref->page); spd.pages[i] = page; -- cgit v1.2.3 From ae415fa4c5248a8cf4faabd5a3c20576cb1ad607 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Dec 2017 21:19:29 -0500 Subject: ring-buffer: Do no reuse reader page if still in use To free the reader page that is allocated with ring_buffer_alloc_read_page(), ring_buffer_free_read_page() must be called. For faster performance, this page can be reused by the ring buffer to avoid having to free and allocate new pages. The issue arises when the page is used with a splice pipe into the networking code. The networking code may up the page counter for the page, and keep it active while sending it is queued to go to the network. The incrementing of the page ref does not prevent it from being reused in the ring buffer, and this can cause the page that is being sent out to the network to be modified before it is sent by reading new data. Add a check to the page ref counter, and only reuse the page if it is not being used anywhere else. Cc: stable@vger.kernel.org Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e06cde093f76..9ab18995ff1e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4404,8 +4404,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct buffer_data_page *bpage = data; + struct page *page = virt_to_page(bpage); unsigned long flags; + /* If the page is still in use someplace else, we can't reuse it */ + if (page_ref_count(page) > 1) + goto out; + local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4417,6 +4422,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); + out: free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); -- cgit v1.2.3 From 24f2aaf952ee0b59f31c3a18b8b36c9e3d3c2cf5 Mon Sep 17 00:00:00 2001 From: Jing Xia Date: Tue, 26 Dec 2017 15:12:53 +0800 Subject: tracing: Fix crash when it fails to alloc ring buffer Double free of the ring buffer happens when it fails to alloc new ring buffer instance for max_buffer if TRACER_MAX_TRACE is configured. The root cause is that the pointer is not set to NULL after the buffer is freed in allocate_trace_buffers(), and the freeing of the ring buffer is invoked again later if the pointer is not equal to Null, as: instance_mkdir() |-allocate_trace_buffers() |-allocate_trace_buffer(tr, &tr->trace_buffer...) |-allocate_trace_buffer(tr, &tr->max_buffer...) // allocate fail(-ENOMEM),first free // and the buffer pointer is not set to null |-ring_buffer_free(tr->trace_buffer.buffer) // out_free_tr |-free_trace_buffers() |-free_trace_buffer(&tr->trace_buffer); //if trace_buffer is not null, free again |-ring_buffer_free(buf->buffer) |-rb_free_cpu_buffer(buffer->buffers[cpu]) // ring_buffer_per_cpu is null, and // crash in ring_buffer_per_cpu->pages Link: http://lkml.kernel.org/r/20171226071253.8968-1-chunyan.zhang@spreadtrum.com Cc: stable@vger.kernel.org Fixes: 737223fbca3b1 ("tracing: Consolidate buffer allocation code") Signed-off-by: Jing Xia Signed-off-by: Chunyan Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73652d5318b2..0e53d46544b8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7603,7 +7603,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot ? size : 1); if (WARN_ON(ret)) { ring_buffer_free(tr->trace_buffer.buffer); + tr->trace_buffer.buffer = NULL; free_percpu(tr->trace_buffer.data); + tr->trace_buffer.data = NULL; return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; -- cgit v1.2.3 From 4397f04575c44e1440ec2e49b6302785c95fd2f8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 26 Dec 2017 20:07:34 -0500 Subject: tracing: Fix possible double free on failure of allocating trace buffer Jing Xia and Chunyan Zhang reported that on failing to allocate part of the tracing buffer, memory is freed, but the pointers that point to them are not initialized back to NULL, and later paths may try to free the freed memory again. Jing and Chunyan fixed one of the locations that does this, but missed a spot. Link: http://lkml.kernel.org/r/20171226071253.8968-1-chunyan.zhang@spreadtrum.com Cc: stable@vger.kernel.org Fixes: 737223fbca3b1 ("tracing: Consolidate buffer allocation code") Reported-by: Jing Xia Reported-by: Chunyan Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e53d46544b8..2a8d8a294345 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7580,6 +7580,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size buf->data = alloc_percpu(struct trace_array_cpu); if (!buf->data) { ring_buffer_free(buf->buffer); + buf->buffer = NULL; return -ENOMEM; } -- cgit v1.2.3 From 39c3fd58952d7599d367c84c1330b785d91d6088 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 2 Dec 2017 18:11:04 +0100 Subject: kernel/irq: Extend lockdep class for request mutex The IRQ code already has support for lockdep class for the lock mutex in an interrupt descriptor. Extend this to add a second class for the request mutex in the descriptor. Not having a class is resulting in false positive splats in some code paths. Signed-off-by: Andrew Lunn Signed-off-by: Thomas Gleixner Acked-by: linus.walleij@linaro.org Cc: grygorii.strashko@ti.com Cc: f.fainelli@gmail.com Link: https://lkml.kernel.org/r/1512234664-21555-1-git-send-email-andrew@lunn.ch --- kernel/irq/generic-chip.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c26c5bb6b491..508c03dfef25 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); /* - * Separate lockdep class for interrupt chip which can nest irq_desc - * lock. + * Separate lockdep classes for interrupt chip which can nest irq_desc + * lock and request mutex. */ static struct lock_class_key irq_nested_lock_class; +static struct lock_class_key irq_nested_request_class; /* * irq_map_generic_chip - Map a generic chip for an irq domain @@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, set_bit(idx, &gc->installed); if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(virq, &irq_nested_lock_class); + irq_set_lockdep_class(virq, &irq_nested_lock_class, + &irq_nested_request_class); if (chip->irq_calc_mask) chip->irq_calc_mask(data); @@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, continue; if (flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(i, &irq_nested_lock_class); + irq_set_lockdep_class(i, &irq_nested_lock_class, + &irq_nested_request_class); if (!(flags & IRQ_GC_NO_MASK)) { struct irq_data *d = irq_get_irq_data(i); -- cgit v1.2.3 From 466a2b42d67644447a1765276259a3ea5531ddff Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 21 Dec 2017 02:22:45 +0100 Subject: cpufreq: schedutil: Use idle_calls counter of the remote CPU Since the recent remote cpufreq callback work, its possible that a cpufreq update is triggered from a remote CPU. For single policies however, the current code uses the local CPU when trying to determine if the remote sg_cpu entered idle or is busy. This is incorrect. To remedy this, compare with the nohz tick idle_calls counter of the remote CPU. Fixes: 674e75411fc2 (sched: cpufreq: Allow remote cpufreq callbacks) Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Joel Fernandes Cc: 4.14+ # 4.14+ Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/time/tick-sched.c | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2f52ec0f1539..d6717a3331a1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, #ifdef CONFIG_NO_HZ_COMMON static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls(); + unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); bool ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99578f06c8d4..77555faf6fbc 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -985,6 +985,19 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value + * for a particular CPU. + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls_cpu(int cpu) +{ + struct tick_sched *ts = tick_get_tick_sched(cpu); + + return ts->idle_calls; +} + /** * tick_nohz_get_idle_calls - return the current idle calls counter value * -- cgit v1.2.3 From 11bca0a83f83f6093d816295668e74ef24595944 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 2 Dec 2017 09:13:04 -0800 Subject: genirq: Guard handle_bad_irq log messages An interrupt storm on a bad interrupt will cause the kernel log to be clogged. [ 60.089234] ->handle_irq(): ffffffffbe2f803f, [ 60.090455] 0xffffffffbf2af380 [ 60.090510] handle_bad_irq+0x0/0x2e5 [ 60.090522] ->irq_data.chip(): ffffffffbf2af380, [ 60.090553] IRQ_NOPROBE set [ 60.090584] ->handle_irq(): ffffffffbe2f803f, [ 60.090590] handle_bad_irq+0x0/0x2e5 [ 60.090596] ->irq_data.chip(): ffffffffbf2af380, [ 60.090602] 0xffffffffbf2af380 [ 60.090608] ->action(): (null) [ 60.090779] handle_bad_irq+0x0/0x2e5 This was seen when running an upstream kernel on Acer Chromebook R11. The system was unstable as result. Guard the log message with __printk_ratelimit to reduce the impact. This won't prevent the interrupt storm from happening, but at least the system remains stable. Signed-off-by: Guenter Roeck Signed-off-by: Thomas Gleixner Cc: Dmitry Torokhov Cc: Joe Perches Cc: Andy Shevchenko Cc: Mika Westerberg Link: https://bugzilla.kernel.org/show_bug.cgi?id=197953 Link: https://lkml.kernel.org/r/1512234784-21038-1-git-send-email-linux@roeck-us.net --- kernel/irq/debug.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 17f05ef8f575..e4d3819a91cc 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -12,6 +12,11 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5); + + if (!__ratelimit(&ratelimit)) + return; + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); printk("->handle_irq(): %p, ", desc->handle_irq); -- cgit v1.2.3 From da5dd9e854d2edd6b02ebfe28583052f922104da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 10:42:10 +0100 Subject: genirq/msi: Handle reactivation only on success When analyzing the fallout of the x86 vector allocation rework it turned out that the error handling in msi_domain_alloc_irqs() is broken. If MSI_FLAG_MUST_REACTIVATE is set for a MSI domain then it clears the activation flag for a successfully initialized msi descriptor. If a subsequent initialization fails then the error handling code path does not deactivate the interrupt because the activation flag got cleared. Move the clearing of the activation flag outside of the initialization loop so that an eventual failure can be cleaned up correctly. Fixes: 22d0b12f3560 ("genirq/irqdomain: Add force reactivation flag to irq domains") Signed-off-by: Thomas Gleixner Tested-by: Alexandru Chirvasitu Tested-by: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- kernel/irq/msi.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index edb987b2c58d..9ba954331171 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -339,6 +339,13 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, return ret; } +static bool msi_check_reservation_mode(struct msi_domain_info *info) +{ + if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) + return false; + return true; +} + /** * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain * @domain: The domain to allocate from @@ -353,9 +360,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - msi_alloc_info_t arg; + struct irq_data *irq_data; struct msi_desc *desc; + msi_alloc_info_t arg; int i, ret, virq; + bool can_reserve; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) @@ -385,6 +394,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, if (ops->msi_finish) ops->msi_finish(&arg, 0); + can_reserve = msi_check_reservation_mode(info); + for_each_msi_entry(desc, dev) { virq = desc->irq; if (desc->nvec_used == 1) @@ -397,15 +408,23 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, * the MSI entries before the PCI layer enables MSI in the * card. Otherwise the card latches a random msi message. */ - if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { - struct irq_data *irq_data; + if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) + continue; + irq_data = irq_domain_get_irq_data(domain, desc->irq); + ret = irq_domain_activate_irq(irq_data, true); + if (ret) + goto cleanup; + } + + /* + * If these interrupts use reservation mode, clear the activated bit + * so request_irq() will assign the final vector. + */ + if (can_reserve) { + for_each_msi_entry(desc, dev) { irq_data = irq_domain_get_irq_data(domain, desc->irq); - ret = irq_domain_activate_irq(irq_data, true); - if (ret) - goto cleanup; - if (info->flags & MSI_FLAG_MUST_REACTIVATE) - irqd_clr_activated(irq_data); + irqd_clr_activated(irq_data); } } return 0; -- cgit v1.2.3 From 69790ba92b8d67eaee5e50b30a5b696d40664caf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 16:44:34 +0100 Subject: genirq: Introduce IRQD_CAN_RESERVE flag Add a new flag to mark interrupts which can use reservation mode. This is going to be used in subsequent patches to disable reservation mode for a certain class of MSI devices. Signed-off-by: Thomas Gleixner Tested-by: Alexandru Chirvasitu Tested-by: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- kernel/irq/debugfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 7f608ac39653..acfaaef8672a 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), + BIT_MASK_DESCR(IRQD_CAN_RESERVE), BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), -- cgit v1.2.3 From 702cb0a02813299d6911b775c637906ae21b737d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 16:59:06 +0100 Subject: genirq/irqdomain: Rename early argument of irq_domain_activate_irq() The 'early' argument of irq_domain_activate_irq() is actually used to denote reservation mode. To avoid confusion, rename it before abuse happens. No functional change. Fixes: 72491643469a ("genirq/irqdomain: Update irq_domain_ops.activate() signature") Signed-off-by: Thomas Gleixner Cc: Alexandru Chirvasitu Cc: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- kernel/irq/internals.h | 2 +- kernel/irq/irqdomain.c | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 07d08ca701ec..ab19371eab9b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline int irq_domain_activate_irq(struct irq_data *data, bool early) +static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve) { irqd_set_activated(data); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4f4f60015e8a..62068ad46930 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) } } -static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) +static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve) { int ret = 0; @@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) if (irqd->parent_data) ret = __irq_domain_activate_irq(irqd->parent_data, - early); + reserve); if (!ret && domain->ops->activate) { - ret = domain->ops->activate(domain, irqd, early); + ret = domain->ops->activate(domain, irqd, reserve); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); @@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate * interrupt - * @irq_data: outermost irq_data associated with interrupt + * @irq_data: Outermost irq_data associated with interrupt + * @reserve: If set only reserve an interrupt vector instead of assigning one * * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. */ -int irq_domain_activate_irq(struct irq_data *irq_data, bool early) +int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve) { int ret = 0; if (!irqd_is_activated(irq_data)) - ret = __irq_domain_activate_irq(irq_data, early); + ret = __irq_domain_activate_irq(irq_data, reserve); if (!ret) irqd_set_activated(irq_data); return ret; -- cgit v1.2.3 From bc976233a872c0f20f018fb1e89264a541584e25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 10:47:22 +0100 Subject: genirq/msi, x86/vector: Prevent reservation mode for non maskable MSI The new reservation mode for interrupts assigns a dummy vector when the interrupt is allocated and assigns a real vector when the interrupt is requested. The reservation mode prevents vector pressure when devices with a large amount of queues/interrupts are initialized, but only a minimal subset of those queues/interrupts is actually used. This mode has an issue with MSI interrupts which cannot be masked. If the driver is not careful or the hardware emits an interrupt before the device irq is requestd by the driver then the interrupt ends up on the dummy vector as a spurious interrupt which can cause malfunction of the device or in the worst case a lockup of the machine. Change the logic for the reservation mode so that the early activation of MSI interrupts checks whether: - the device is a PCI/MSI device - the reservation mode of the underlying irqdomain is activated - PCI/MSI masking is globally enabled - the PCI/MSI device uses either MSI-X, which supports masking, or MSI with the maskbit supported. If one of those conditions is false, then clear the reservation mode flag in the irq data of the interrupt and invoke irq_domain_activate_irq() with the reserve argument cleared. In the x86 vector code, clear the can_reserve flag in the vector allocation data so a subsequent free_irq() won't create the same situation again. The interrupt stays assigned to a real vector until pci_disable_msi() is invoked and all allocations are undone. Fixes: 4900be83602b ("x86/vector/msi: Switch to global reservation mode") Reported-by: Alexandru Chirvasitu Reported-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Tested-by: Alexandru Chirvasitu Tested-by: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291406420.1899@nanos Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291409460.1899@nanos --- kernel/irq/msi.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9ba954331171..2f3c4f5382cc 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -339,11 +339,38 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, return ret; } -static bool msi_check_reservation_mode(struct msi_domain_info *info) +/* + * Carefully check whether the device can use reservation mode. If + * reservation mode is enabled then the early activation will assign a + * dummy vector to the device. If the PCI/MSI device does not support + * masking of the entry then this can result in spurious interrupts when + * the device driver is not absolutely careful. But even then a malfunction + * of the hardware could result in a spurious interrupt on the dummy vector + * and render the device unusable. If the entry can be masked then the core + * logic will prevent the spurious interrupt and reservation mode can be + * used. For now reservation mode is restricted to PCI/MSI. + */ +static bool msi_check_reservation_mode(struct irq_domain *domain, + struct msi_domain_info *info, + struct device *dev) { + struct msi_desc *desc; + + if (domain->bus_token != DOMAIN_BUS_PCI_MSI) + return false; + if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; - return true; + + if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + return false; + + /* + * Checking the first MSI descriptor is sufficient. MSIX supports + * masking and MSI does so when the maskbit is set. + */ + desc = first_msi_entry(dev); + return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; } /** @@ -394,7 +421,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, if (ops->msi_finish) ops->msi_finish(&arg, 0); - can_reserve = msi_check_reservation_mode(info); + can_reserve = msi_check_reservation_mode(domain, info, dev); for_each_msi_entry(desc, dev) { virq = desc->irq; @@ -412,7 +439,9 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, continue; irq_data = irq_domain_get_irq_data(domain, desc->irq); - ret = irq_domain_activate_irq(irq_data, true); + if (!can_reserve) + irqd_clr_can_reserve(irq_data); + ret = irq_domain_activate_irq(irq_data, can_reserve); if (ret) goto cleanup; } -- cgit v1.2.3 From ced6d5c11d3e7b342f1a80f908e6756ebd4b8ddd Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Fri, 22 Dec 2017 15:51:12 +0100 Subject: timers: Use deferrable base independent of base::nohz_active During boot and before base::nohz_active is set in the timer bases, deferrable timers are enqueued into the standard timer base. This works correctly as long as base::nohz_active is false. Once it base::nohz_active is set and a timer which was enqueued before that is accessed the lock selector code choses the lock of the deferred base. This causes unlocked access to the standard base and in case the timer is removed it does not clear the pending flag in the standard base bitmap which causes get_next_timer_interrupt() to return bogus values. To prevent that, the deferrable timers must be enqueued in the deferrable base, even when base::nohz_active is not set. Those deferrable timers also need to be expired unconditional. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: stable@vger.kernel.org Cc: rt@linutronix.de Cc: Paul McKenney Link: https://lkml.kernel.org/r/20171222145337.633328378@linutronix.de --- kernel/time/timer.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ffebcf878fba..19a9c3da7698 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -823,11 +823,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && - (tflags & TIMER_DEFERRABLE)) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); return base; } @@ -837,11 +836,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && - (tflags & TIMER_DEFERRABLE)) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = this_cpu_ptr(&timer_bases[BASE_DEF]); return base; } @@ -1684,7 +1682,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) base->must_forward_clk = false; __run_timers(base); - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } -- cgit v1.2.3 From 26456f87aca7157c057de65c9414b37f1ab881d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Dec 2017 21:37:25 +0100 Subject: timers: Reinitialize per cpu bases on hotplug The timer wheel bases are not (re)initialized on CPU hotplug. That leaves them with a potentially stale clk and next_expiry valuem, which can cause trouble then the CPU is plugged. Add a prepare callback which forwards the clock, sets next_expiry to far in the future and reset the control flags to a known state. Set base->must_forward_clk so the first timer which is queued will try to forward the clock to current jiffies. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: Anna-Maria Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272152200.2431@nanos --- kernel/cpu.c | 4 ++-- kernel/time/timer.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 41376c3ac93b..97858477e586 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = { * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. */ - [CPUHP_TIMERS_DEAD] = { + [CPUHP_TIMERS_PREPARE] = { .name = "timers:dead", - .startup.single = NULL, + .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 19a9c3da7698..6be576e02209 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1853,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } +int timers_prepare_cpu(unsigned int cpu) +{ + struct timer_base *base; + int b; + + for (b = 0; b < NR_BASES; b++) { + base = per_cpu_ptr(&timer_bases[b], cpu); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->is_idle = false; + base->must_forward_clk = true; + } + return 0; +} + int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; -- cgit v1.2.3 From 5d62c183f9e9df1deeea0906d099a94e8a43047a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Dec 2017 15:51:13 +0100 Subject: nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick() The conditions in irq_exit() to invoke tick_nohz_irq_exit() which subsequently invokes tick_nohz_stop_sched_tick() are: if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) If need_resched() is not set, but a timer softirq is pending then this is an indication that the softirq code punted and delegated the execution to softirqd. need_resched() is not true because the current interrupted task takes precedence over softirqd. Invoking tick_nohz_irq_exit() in this case can cause an endless loop of timer interrupts because the timer wheel contains an expired timer, but softirqs are not yet executed. So it returns an immediate expiry request, which causes the timer to fire immediately again. Lather, rinse and repeat.... Prevent that by adding a check for a pending timer soft interrupt to the conditions in tick_nohz_stop_sched_tick() which avoid calling get_next_timer_interrupt(). That keeps the tick sched timer on the tick and prevents a repetitive programming of an already expired timer. Reported-by: Sebastian Siewior Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul McKenney Cc: Anna-Maria Gleixner Cc: Sebastian Siewior Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272156050.2431@nanos --- kernel/time/tick-sched.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 77555faf6fbc..f7cc7abfcf25 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) ts->next_tick = 0; } +static inline bool local_timer_softirq_pending(void) +{ + return local_softirq_pending() & TIMER_SOFTIRQ; +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { @@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; - if (rcu_needs_cpu(basemono, &next_rcu) || - arch_needs_cpu() || irq_work_needs_cpu()) { + /* + * Keep the periodic tick, when RCU, architecture or irq_work + * requests it. + * Aside of that check whether the local timer softirq is + * pending. If so its a bad idea to call get_next_timer_interrupt() + * because there is an already expired timer, so it will request + * immeditate expiry, which rearms the hardware timer with a + * minimal delta which brings us back to this place + * immediately. Lather, rinse and repeat... + */ + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || + irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* -- cgit v1.2.3 From fd45bb77ad682be728d1002431d77b8c73342836 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Dec 2017 15:51:14 +0100 Subject: timers: Invoke timer_start_debug() where it makes sense The timer start debug function is called before the proper timer base is set. As a consequence the trace data contains the stale CPU and flags values. Call the debug function after setting the new base and flags. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: stable@vger.kernel.org Cc: rt@linutronix.de Cc: Paul McKenney Cc: Anna-Maria Gleixner Link: https://lkml.kernel.org/r/20171222145337.792907137@linutronix.de --- kernel/time/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 6be576e02209..89a9e1b4264a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1007,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; - debug_activate(timer, expires); - new_base = get_target_base(base, timer->flags); if (base != new_base) { @@ -1032,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } + debug_activate(timer, expires); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance -- cgit v1.2.3 From 29f1b2b0fecfae69e31833836f1da3136696eee5 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 28 Dec 2017 22:11:36 -0500 Subject: posix-timers: Prevent UB from shifting negative signed value Shifting a negative signed number is undefined behavior. Looking at the macros MAKE_PROCESS_CPUCLOCK and FD_TO_CLOCKID, it seems that the subexpression: (~(clockid_t) (pid) << 3) where clockid_t resolves to a signed int, which once negated, is undefined behavior to shift the value of if the results thus far are negative. It was further suggested to make these macros into inline functions. Suggested-by: Thomas Gleixner Signed-off-by: Nick Desaulniers Signed-off-by: Thomas Gleixner Cc: Dimitri Sivanich Cc: Frederic Weisbecker Cc: Al Viro Cc: linux-kselftest@vger.kernel.org Cc: Shuah Khan Cc: Deepa Dinamani Link: https://lkml.kernel.org/r/1514517100-18051-1-git-send-email-nick.desaulniers@gmail.com --- kernel/time/posix-clock.c | 2 +- kernel/time/posix-cpu-timers.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 17cdc554c9fe..cc91d90abd84 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -216,7 +216,7 @@ struct posix_clock_desc { static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) { - struct file *fp = fget(CLOCKID_TO_FD(id)); + struct file *fp = fget(clockid_to_fd(id)); int err = -EINVAL; if (!fp) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1f27887aa194..cef79ca5bbd5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1363,8 +1363,8 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } -#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) -#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) +#define PROCESS_CLOCK make_process_cpuclock(0, CPUCLOCK_SCHED) +#define THREAD_CLOCK make_thread_cpuclock(0, CPUCLOCK_SCHED) static int process_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) -- cgit v1.2.3 From 4d9570158b6260f449e317a5f9ed030c2504a615 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 4 Jan 2018 16:17:49 -0800 Subject: kernel/acct.c: fix the acct->needcheck check in check_free_space() As Tsukada explains, the time_is_before_jiffies(acct->needcheck) check is very wrong, we need time_is_after_jiffies() to make sys_acct() work. Ignoring the overflows, the code should "goto out" if needcheck > jiffies, while currently it checks "needcheck < jiffies" and thus in the likely case check_free_space() does nothing until jiffies overflow. In particular this means that sys_acct() is simply broken, acct_on() sets acct->needcheck = jiffies and expects that check_free_space() should set acct->active = 1 after the free-space check, but this won't happen if jiffies increments in between. This was broken by commit 32dc73086015 ("get rid of timer in kern/acct.c") in 2011, then another (correct) commit 795a2f22a8ea ("acct() should honour the limits from the very beginning") made the problem more visible. Link: http://lkml.kernel.org/r/20171213133940.GA6554@redhat.com Fixes: 32dc73086015 ("get rid of timer in kern/acct.c") Reported-by: TSUKADA Koutaro Suggested-by: TSUKADA Koutaro Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index d15c0ee4d955..addf7732fb56 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_before_jiffies(acct->needcheck)) + if (time_is_after_jiffies(acct->needcheck)) goto out; /* May block */ -- cgit v1.2.3 From dc8635b78cd8669c37e230058d18c33af7451ab1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 4 Jan 2018 16:17:56 -0800 Subject: kernel/exit.c: export abort() to modules gcc -fisolate-erroneous-paths-dereference can generate calls to abort() from modular code too. [arnd@arndb.de: drop duplicate exports of abort()] Link: http://lkml.kernel.org/r/20180102103311.706364-1-arnd@arndb.de Reported-by: Vineet Gupta Cc: Sudip Mukherjee Cc: Arnd Bergmann Cc: Alexey Brodkin Cc: Russell King Cc: Jose Abreu Signed-off-by: Andrew Morton Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index df0c91d5606c..995453d9fb55 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1763,3 +1763,4 @@ __weak void abort(void) /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } +EXPORT_SYMBOL(abort); -- cgit v1.2.3 From bdbc98abb3aa323f6323b11db39c740e6f8fc5b1 Mon Sep 17 00:00:00 2001 From: Rainer Fiebig Date: Fri, 22 Dec 2017 11:13:59 +0100 Subject: PM: hibernate: Do not subtract NR_FILE_MAPPED in minimum_image_size() s2disk/s2both may fail unnecessarily and erratically if NR_FILE_MAPPED is high - for instance when using VMs with VirtualBox and perhaps VMware Player. In those situations s2disk becomes unreliable and therefore unusable. A typical scenario is: user issues a s2disk and it fails. User issues a second s2disk immediately after that and it succeeds. And user wonders why. The problem is caused by minimum_image_size() in snapshot.c. The value it returns is roughly 100% too high because NR_FILE_MAPPED is subtracted in its calculation. Eventually the number of preallocated image pages is falsely too low. This doesn't matter as long as NR_FILE_MAPPED-values are in a normal range or in 32bit-environments as the code allows for allocation of additional pages from highmem. But with the high values generated by VirtualBox-VMs (a 2-GB-VM causes NR_FILE_MAPPED go up by 2 GB) it may lead to failure in 64bit-systems. Not subtracting NR_FILE_MAPPED in minimum_image_size() solves the problem. I've done at least hundreds of successful s2both/s2disk now on an x86_64 system (with and without VirtualBox) which gives me some confidence that this is right. It has turned s2disk/s2both from unusable into 100% reliable. Link: https://bugzilla.kernel.org/show_bug.cgi?id=97201 Signed-off-by: Rainer Fiebig Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index bce0464524d8..3d37c279c090 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1645,8 +1645,7 @@ static unsigned long free_unnecessary_pages(void) * [number of saveable pages] - [number of pages that can be freed in theory] * * where the second term is the sum of (1) reclaimable slab pages, (2) active - * and (3) inactive anonymous pages, (4) active and (5) inactive file pages, - * minus mapped file pages. + * and (3) inactive anonymous pages, (4) active and (5) inactive file pages. */ static unsigned long minimum_image_size(unsigned long saveable) { @@ -1656,8 +1655,7 @@ static unsigned long minimum_image_size(unsigned long saveable) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) - + global_node_page_state(NR_INACTIVE_FILE) - - global_node_page_state(NR_FILE_MAPPED); + + global_node_page_state(NR_INACTIVE_FILE); return saveable <= size ? 0 : saveable - size; } -- cgit v1.2.3 From 040ee69226f8a96b7943645d68f41d5d44b5ff7d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 Dec 2017 20:20:38 -0500 Subject: fix "netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'" Descriptor table is a shared object; it's not a place where you can stick temporary references to files, especially when we don't need an opened file at all. Cc: stable@vger.kernel.org # v4.14 Fixes: 98589a0998b8 ("netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'") Signed-off-by: Al Viro --- kernel/bpf/inode.c | 40 +++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 2 +- 2 files changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 01aaef1a77c5..5bb5e49ef4c3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -368,7 +368,45 @@ out: putname(pname); return ret; } -EXPORT_SYMBOL_GPL(bpf_obj_get_user); + +static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + if (ret) + return ERR_PTR(ret); + + if (inode->i_op == &bpf_map_iops) + return ERR_PTR(-EINVAL); + if (inode->i_op != &bpf_prog_iops) + return ERR_PTR(-EACCES); + + prog = inode->i_private; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ERR_PTR(ret); + + if (!bpf_prog_get_ok(prog, &type, false)) + return ERR_PTR(-EINVAL); + + return bpf_prog_inc(prog); +} + +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + struct path path; + int ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + prog = __get_prog_inode(d_backing_inode(path.dentry), type); + if (!IS_ERR(prog)) + touch_atime(&path); + path_put(&path); + return prog; +} +EXPORT_SYMBOL(bpf_prog_get_type_path); static void bpf_evict_inode(struct inode *inode) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c4cfeaa8d5e..5cb783fc8224 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_get_ok(struct bpf_prog *prog, +bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ -- cgit v1.2.3 From 263663cd3c4fbfc40cb7504c4be2dadbc0992cc1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 20:22:04 +0800 Subject: block: convert to bio_first_bvec_all & bio_first_page_all This patch converts to bio_first_bvec_all() & bio_first_page_all() for retrieving the 1st bvec/page, and prepares for supporting multipage bvec. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 293ead59eccc..96c736313faa 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -240,7 +240,7 @@ static void hib_init_batch(struct hib_bio_batch *hb) static void hib_end_io(struct bio *bio) { struct hib_bio_batch *hb = bio->bi_private; - struct page *page = bio->bi_io_vec[0].bv_page; + struct page *page = bio_first_page_all(bio); if (bio->bi_status) { pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", -- cgit v1.2.3 From 5731a879d03bdaa00265f8ebc32dfd0e65d25276 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 4 Jan 2018 20:02:09 -0800 Subject: bpf: sockmap missing NULL psock check Add psock NULL check to handle a racing sock event that can get the sk_callback_lock before this case but after xchg happens causing the refcnt to hit zero and sock user data (psock) to be null and queued for garbage collection. Also add a comment in the code because this is a bit subtle and not obvious in my opinion. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41893d9..1712d319c2d8 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map) write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); - smap_list_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + } write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); -- cgit v1.2.3 From 313ccb96159489eabdbdcf4deb34e7fbac17557d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 7 Jan 2018 17:03:47 +0100 Subject: perf: Allocate context task_ctx_data for child event Currently we use perf_event_context::task_ctx_data to save and restore the LBR status when the task is scheduled out and in. We don't allocate it for child contexts, which results in shorter task's LBR stack, because we don't save the history from previous run and start over every time we schedule the task in. I made a test to generate samples with LBR call stack and got higher numbers on bigger chain depths: before: after: LBR call chain: nr: 1 60561 498127 LBR call chain: nr: 2 0 0 LBR call chain: nr: 3 107030 2172 LBR call chain: nr: 4 466685 62758 LBR call chain: nr: 5 2307319 878046 LBR call chain: nr: 6 48713 495218 LBR call chain: nr: 7 1040 4551 LBR call chain: nr: 8 481 172 LBR call chain: nr: 9 878 120 LBR call chain: nr: 10 2377 6698 LBR call chain: nr: 11 28830 151487 LBR call chain: nr: 12 29347 339867 LBR call chain: nr: 13 4 22 LBR call chain: nr: 14 3 53 Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Fixes: 4af57ef28c2c ("perf: Add pmu specific data for perf task context") Link: http://lkml.kernel.org/r/20180107160356.28203-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4df5b695bf0d..55fb648a32b0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10703,6 +10703,19 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + + if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) && + !child_ctx->task_ctx_data) { + struct pmu *pmu = child_event->pmu; + + child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, + GFP_KERNEL); + if (!child_ctx->task_ctx_data) { + free_event(child_event); + return NULL; + } + } + /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against @@ -10713,6 +10726,7 @@ inherit_event(struct perf_event *parent_event, if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); + /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } -- cgit v1.2.3 From 8cf7e0e22414f5acf85ecb7cd0d4482e6c9696ae Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 7 Jan 2018 17:03:49 +0100 Subject: perf: Make perf_callchain function static And move it to core.c, because there's no caller of this function other than the one in core.c Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180107160356.28203-6-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/callchain.c | 15 --------------- kernel/events/core.c | 16 ++++++++++++++++ kernel/events/internal.h | 4 ---- 3 files changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 1b2be63c8528..772a43fea825 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -178,21 +178,6 @@ put_callchain_entry(int rctx) put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } -struct perf_callchain_entry * -perf_callchain(struct perf_event *event, struct pt_regs *regs) -{ - bool kernel = !event->attr.exclude_callchain_kernel; - bool user = !event->attr.exclude_callchain_user; - /* Disallow cross-task user callchains. */ - bool crosstask = event->ctx->task && event->ctx->task != current; - const u32 max_stack = event->attr.sample_max_stack; - - if (!kernel && !user) - return NULL; - - return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); -} - struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) diff --git a/kernel/events/core.c b/kernel/events/core.c index 55fb648a32b0..5fc1ded4b450 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5980,6 +5980,22 @@ static u64 perf_virt_to_phys(u64 virt) return phys_addr; } +static struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs) +{ + bool kernel = !event->attr.exclude_callchain_kernel; + bool user = !event->attr.exclude_callchain_user; + /* Disallow cross-task user callchains. */ + bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; + + if (!kernel && !user) + return NULL; + + return get_perf_callchain(regs, 0, kernel, user, + max_stack, crosstask, true); +} + void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09b1537ae06c..6dc725a7e7bc 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -201,10 +201,6 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) -/* Callchain handling */ -extern struct perf_callchain_entry * -perf_callchain(struct perf_event *event, struct pt_regs *regs); - static inline int get_recursion_context(int *recursion) { int rctx; -- cgit v1.2.3 From 99e818cc88889a2fa2f483b91b372c47b94b7c98 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 7 Jan 2018 17:03:50 +0100 Subject: perf: Return empty callchain instead of NULL It simplifies the code a bit, because we dump the callchain Link: http://lkml.kernel.org/n/tip-uqp7qd6aif47g39glnbu95yl@git.kernel.org even if it's empty. With 'empty' callchain we can remove all the NULL-checking code paths. Original-patch-from: Peter Zijlstra Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20180107160356.28203-7-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5fc1ded4b450..4e1a1bf8d867 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5815,19 +5815,11 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (data->callchain) { - int size = 1; - - if (data->callchain) - size += data->callchain->nr; - - size *= sizeof(u64); + int size = 1; - __output_copy(handle, data->callchain, size); - } else { - u64 nr = 0; - perf_output_put(handle, nr); - } + size += data->callchain->nr; + size *= sizeof(u64); + __output_copy(handle, data->callchain, size); } if (sample_type & PERF_SAMPLE_RAW) { @@ -5980,6 +5972,8 @@ static u64 perf_virt_to_phys(u64 virt) return phys_addr; } +static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; + static struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { @@ -5988,12 +5982,14 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack; + struct perf_callchain_entry *callchain; if (!kernel && !user) - return NULL; + return &__empty_callchain; - return get_perf_callchain(regs, 0, kernel, user, - max_stack, crosstask, true); + callchain = get_perf_callchain(regs, 0, kernel, user, + max_stack, crosstask, true); + return callchain ?: &__empty_callchain; } void perf_prepare_sample(struct perf_event_header *header, @@ -6018,9 +6014,7 @@ void perf_prepare_sample(struct perf_event_header *header, int size = 1; data->callchain = perf_callchain(event, regs); - - if (data->callchain) - size += data->callchain->nr; + size += data->callchain->nr; header->size += size * sizeof(u64); } -- cgit v1.2.3 From 527187d28569e39c5d489d6306d3b79605cf85a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jan 2018 17:27:19 +0100 Subject: locking/lockdep: Remove cross-release leftovers There's two cross-release leftover facilities: - the crossrelease_hist_*() irq-tracing callbacks (NOPs currently) - the complete_release_commit() callback (NOP as well) Remove them. Cc: David Sterba Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/completion.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 2ddaec40956f..0926aef10dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -34,11 +34,6 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); - /* - * Perform commit of crossrelease here. - */ - complete_release_commit(x); - if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); -- cgit v1.2.3 From 6baf9e67c9c5d738188b8490893c7e079d3deb7e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 5 Jan 2018 05:19:56 +0100 Subject: irq/work: Improve the flag definitions IRQ_WORK_FLAGS is defined simply to 3UL. This is confusing as it says nothing about its purpose. Define IRQ_WORK_FLAGS as a bitwise OR of IRQ_WORK_PENDING and IRQ_WORK_BUSY and change its name to IRQ_WORK_CLAIMED. While we're at it: use the BIT() macro for all flags. Signed-off-by: Bartosz Golaszewski Signed-off-by: Frederic Weisbecker Reviewed-by: Andy Shevchenko Cc: Linus Torvalds Cc: Marc Zyngier Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1515125996-21564-1-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 40e9d739c169..6b7cdf17ccf8 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -36,7 +36,7 @@ static bool irq_work_claim(struct irq_work *work) */ flags = work->flags & ~IRQ_WORK_PENDING; for (;;) { - nflags = flags | IRQ_WORK_FLAGS; + nflags = flags | IRQ_WORK_CLAIMED; oflags = cmpxchg(&work->flags, flags, nflags); if (oflags == flags) break; -- cgit v1.2.3 From b2157399cc9898260d6031c5bfe45fe137c1fbe7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 7 Jan 2018 17:33:02 -0800 Subject: bpf: prevent out-of-bounds speculation Under speculation, CPUs may mis-predict branches in bounds checks. Thus, memory accesses under a bounds check may be speculated even if the bounds check fails, providing a primitive for building a side channel. To avoid leaking kernel data round up array-based maps and mask the index after bounds check, so speculated load with out of bounds index will load either valid value from the array or zero from the padded area. Unconditionally mask index for all array types even when max_entries are not rounded to power of 2 for root user. When map is created by unpriv user generate a sequence of bpf insns that includes AND operation to make sure that JITed code includes the same 'index & index_mask' operation. If prog_array map is created by unpriv user replace bpf_tail_call(ctx, map, index); with if (index >= max_entries) { index &= map->index_mask; bpf_tail_call(ctx, map, index); } (along with roundup to power 2) to prevent out-of-bounds speculation. There is secondary redundant 'if (index >= max_entries)' in the interpreter and in all JITs, but they can be optimized later if necessary. Other array-like maps (cpumap, devmap, sockmap, perf_event_array, cgroup_array) cannot be used by unpriv, so no changes there. That fixes bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on all architectures with and without JIT. v2->v3: Daniel noticed that attack potentially can be crafted via syscall commands without loading the program, so add masking to those paths as well. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 47 ++++++++++++++++++++++++++++++++++++----------- kernel/bpf/verifier.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7c25426d3cf5..aaa319848e7d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; u64 array_size; - u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -72,11 +73,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); + max_entries = attr->max_entries; + index_mask = roundup_pow_of_two(max_entries) - 1; + + if (unpriv) + /* round up array size to nearest power of 2, + * since cpu will speculate within index_mask limits + */ + max_entries = index_mask + 1; + array_size = sizeof(*array); if (percpu) - array_size += (u64) attr->max_entries * sizeof(void *); + array_size += (u64) max_entries * sizeof(void *); else - array_size += (u64) attr->max_entries * elem_size; + array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ if (array_size >= U32_MAX - PAGE_SIZE) @@ -86,6 +96,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); + array->index_mask = index_mask; + array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ array->map.map_type = attr->map_type; @@ -121,12 +133,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * index; + return array->value + array->elem_size * (index & array->index_mask); } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; @@ -135,7 +148,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); @@ -157,7 +175,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return this_cpu_ptr(array->pptrs[index]); + return this_cpu_ptr(array->pptrs[index & array->index_mask]); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) @@ -177,7 +195,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); off += size; @@ -225,10 +243,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) - memcpy(this_cpu_ptr(array->pptrs[index]), + memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value + array->elem_size * index, + memcpy(array->value + + array->elem_size * (index & array->index_mask), value, map->value_size); return 0; } @@ -262,7 +281,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); off += size; @@ -613,6 +632,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) static u32 array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = round_up(map->value_size, 8); struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -621,7 +641,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04b24876cd23..b414d6b2d470 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1729,6 +1729,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; + if (func_id == BPF_FUNC_tail_call) { + if (meta.map_ptr == NULL) { + verbose(env, "verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -4456,6 +4463,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON) { + verbose(env, "tail_call obusing map_ptr\n"); + return -EINVAL; + } + if (!map_ptr->unpriv_array) + continue; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } -- cgit v1.2.3 From 290af86629b25ffd1ed6232c4e9107da031705cb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Jan 2018 10:04:29 -0800 Subject: bpf: introduce BPF_JIT_ALWAYS_ON config The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715. A quote from goolge project zero blog: "At this point, it would normally be necessary to locate gadgets in the host kernel code that can be used to actually leak data by reading from an attacker-controlled location, shifting and masking the result appropriately and then using the result of that as offset to an attacker-controlled address for a load. But piecing gadgets together and figuring out which ones work in a speculation context seems annoying. So instead, we decided to use the eBPF interpreter, which is built into the host kernel - while there is no legitimate way to invoke it from inside a VM, the presence of the code in the host kernel's text section is sufficient to make it usable for the attack, just like with ordinary ROP gadgets." To make attacker job harder introduce BPF_JIT_ALWAYS_ON config option that removes interpreter from the kernel in favor of JIT-only mode. So far eBPF JIT is supported by: x64, arm64, arm32, sparc64, s390, powerpc64, mips64 The start of JITed program is randomized and code page is marked as read-only. In addition "constant blinding" can be turned on with net.core.bpf_jit_harden v2->v3: - move __bpf_prog_ret0 under ifdef (Daniel) v1->v2: - fix init order, test_bpf and cBPF (Daniel's feedback) - fix offloaded bpf (Jakub's feedback) - add 'return 0' dummy in case something can invoke prog->bpf_func - retarget bpf tree. For bpf-next the patch would need one extra hunk. It will be sent when the trees are merged back to net-next Considered doing: int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; but it seems better to land the patch as-is and in bpf-next remove bpf_jit_enable global variable from all JITs, consolidate in one place and remove this jit_init() function. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa26ee8..51ec2dda7f08 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#else +static unsigned int __bpf_prog_ret0(const void *ctx, + const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) -- cgit v1.2.3 From 4bf236a3330e97d275e5848420f7e31948fef07a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 5 Jan 2018 09:19:08 -0800 Subject: PM / sleep: Make lock/unlock_system_sleep() available to kernel modules Since pm_mutex is not exported using lock/unlock_system_sleep() from inside a kernel module causes a "pm_mutex undefined" linker error. Hence move lock/unlock_system_sleep() into kernel/power/main.c and export these. Signed-off-by: Bart Van Assche Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 3a2ca9066583..705c2366dafe 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -22,6 +22,35 @@ DEFINE_MUTEX(pm_mutex); #ifdef CONFIG_PM_SLEEP +void lock_system_sleep(void) +{ + current->flags |= PF_FREEZER_SKIP; + mutex_lock(&pm_mutex); +} +EXPORT_SYMBOL_GPL(lock_system_sleep); + +void unlock_system_sleep(void) +{ + /* + * Don't use freezer_count() because we don't want the call to + * try_to_freeze() here. + * + * Reason: + * Fundamentally, we just don't need it, because freezing condition + * doesn't come into effect until we release the pm_mutex lock, + * since the freezer always works with pm_mutex held. + * + * More importantly, in the case of hibernation, + * unlock_system_sleep() gets called in snapshot_read() and + * snapshot_write() when the freezing condition is still in effect. + * Which means, if we use try_to_freeze() here, it would make them + * enter the refrigerator, thus causing hibernation to lockup. + */ + current->flags &= ~PF_FREEZER_SKIP; + mutex_unlock(&pm_mutex); +} +EXPORT_SYMBOL_GPL(unlock_system_sleep); + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); -- cgit v1.2.3 From 541676078b52f365f53d46ee5517d305cd1b6350 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 15 Dec 2017 14:23:10 -0500 Subject: membarrier: Disable preemption when calling smp_call_function_many() smp_call_function_many() requires disabling preemption around the call. Signed-off-by: Mathieu Desnoyers Cc: # v4.14+ Cc: Andrea Parri Cc: Andrew Hunter Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171215192310.25293-1-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index dd7908743dab..9bcbacba82a8 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -89,7 +89,9 @@ static int membarrier_private_expedited(void) rcu_read_unlock(); } if (!fallback) { + preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); free_cpumask_var(tmpmask); } cpus_read_unlock(); -- cgit v1.2.3 From 31cb1bc0dc94882a588930f4d007b570c481fd17 Mon Sep 17 00:00:00 2001 From: rodrigosiqueira Date: Fri, 15 Dec 2017 12:06:03 -0200 Subject: sched/core: Rework and clarify prepare_lock_switch() The prepare_lock_switch() function has an unused parameter, and also the function name was not descriptive. To improve readability and remove the extra parameter, do the following changes: * Move prepare_lock_switch() from kernel/sched/sched.h to kernel/sched/core.c, rename it to prepare_task(), and remove the unused parameter. * Split the smp_store_release() out from finish_lock_switch() to a function named finish_task. * Comments ajdustments. Signed-off-by: Rodrigo Siqueira Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171215140603.gxe5i2y6fg5ojfpp@smtp.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 41 ---------------------------------------- 2 files changed, 49 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 644fa2e3d993..a794f8155cd5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2045,7 +2045,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. * - * Pairs with the smp_store_release() in finish_lock_switch(). + * Pairs with the smp_store_release() in finish_task(). * * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. @@ -2571,6 +2571,50 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, #endif /* CONFIG_PREEMPT_NOTIFIERS */ +static inline void prepare_task(struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * Claim the task as running, we do this before switching to it + * such that any running task will have this set. + */ + next->on_cpu = 1; +#endif +} + +static inline void finish_task(struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif +} + +static inline void finish_lock_switch(struct rq *rq) +{ +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + raw_spin_unlock_irq(&rq->lock); +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -2591,7 +2635,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); + prepare_task(next); prepare_arch_switch(next); } @@ -2646,7 +2690,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) * the scheduled task must drop that reference. * * We must observe prev->state before clearing prev->on_cpu (in - * finish_lock_switch), otherwise a concurrent wakeup can get prev + * finish_task), otherwise a concurrent wakeup can get prev * running on another CPU and we could rave with its RUNNING -> DEAD * transition, resulting in a double drop. */ @@ -2663,7 +2707,8 @@ static struct rq *finish_task_switch(struct task_struct *prev) * to use. */ smp_mb__after_unlock_lock(); - finish_lock_switch(rq, prev); + finish_task(prev); + finish_lock_switch(rq); finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b19552a212de..43f5d6e936bb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1328,47 +1328,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - * - * In particular, the load of prev->state in finish_task_switch() must - * happen before this. - * - * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). - */ - smp_store_release(&prev->on_cpu, 0); -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - raw_spin_unlock_irq(&rq->lock); -} - /* * wake flags */ -- cgit v1.2.3 From f01415fdbfe83380c2dfcf90b7b26042f88963aa Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 5 Dec 2017 17:10:15 +0000 Subject: sched/fair: Use 'unsigned long' for utilization, consistently Utilization and capacity are tracked as 'unsigned long', however some functions using them return an 'int' which is ultimately assigned back to 'unsigned long' variables. Since there is not scope on using a different and signed type, consolidate the signature of functions returning utilization to always use the native type. This change improves code consistency, and it also benefits code paths where utilizations should be clamped by avoiding further type conversions or ugly type casts. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Redpath Reviewed-by: Brendan Jackman Reviewed-by: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20171205171018.9203-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2915c0d95107..de43bd80a98f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5765,8 +5765,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return affine; } -static inline int task_util(struct task_struct *p); -static int cpu_util_wake(int cpu, struct task_struct *p); +static inline unsigned long task_util(struct task_struct *p); +static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) { @@ -6247,7 +6247,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static int cpu_util(int cpu) +static unsigned long cpu_util(int cpu) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); @@ -6255,7 +6255,7 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } -static inline int task_util(struct task_struct *p) +static inline unsigned long task_util(struct task_struct *p) { return p->se.avg.util_avg; } @@ -6264,7 +6264,7 @@ static inline int task_util(struct task_struct *p) * cpu_util_wake: Compute cpu utilization with any contributions from * the waking task p removed. */ -static int cpu_util_wake(int cpu, struct task_struct *p) +static unsigned long cpu_util_wake(int cpu, struct task_struct *p) { unsigned long util, capacity; -- cgit v1.2.3 From f453ae2200b0d1b7abc0c3794ce088899ac7a2af Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 14 Dec 2017 13:21:58 -0800 Subject: sched/fair: Consider RT/IRQ pressure in capacity_spare_wake() capacity_spare_wake() in the slow path influences choice of idlest groups, as we search for groups with maximum spare capacity. In scenarios where RT pressure is high, a sub optimal group can be chosen and hurt performance of the task being woken up. Fix this by using capacity_of() instead of capacity_orig_of() in capacity_spare_wake(). Tests results from improvements with this change are below. More tests were also done by myself and Matt Fleming to ensure no degradation in different benchmarks. 1) Rohit ran barrier.c test (details below) with following improvements: ------------------------------------------------------------------------ This was Rohit's original use case for a patch he posted at [1] however from his recent tests he showed my patch can replace his slow path changes [1] and there's no need to selectively scan/skip CPUs in find_idlest_group_cpu in the slow path to get the improvement he sees. barrier.c (open_mp code) as a micro-benchmark. It does a number of iterations and barrier sync at the end of each for loop. Here barrier,c is running in along with ping on CPU 0 and 1 as: 'ping -l 10000 -q -s 10 -f hostX' barrier.c can be found at: http://www.spinics.net/lists/kernel/msg2506955.html Following are the results for the iterations per second with this micro-benchmark (higher is better), on a 44 core, 2 socket 88 Threads Intel x86 machine: +--------+------------------+---------------------------+ |Threads | Without patch | With patch | | | | | +--------+--------+---------+-----------------+---------+ | | Mean | Std Dev | Mean | Std Dev | +--------+--------+---------+-----------------+---------+ |1 | 539.36 | 60.16 | 572.54 (+6.15%) | 40.95 | |2 | 481.01 | 19.32 | 530.64 (+10.32%)| 56.16 | |4 | 474.78 | 22.28 | 479.46 (+0.99%) | 18.89 | |8 | 450.06 | 24.91 | 447.82 (-0.50%) | 12.36 | |16 | 436.99 | 22.57 | 441.88 (+1.12%) | 7.39 | |32 | 388.28 | 55.59 | 429.4 (+10.59%)| 31.14 | |64 | 314.62 | 6.33 | 311.81 (-0.89%) | 11.99 | +--------+--------+---------+-----------------+---------+ 2) ping+hackbench test on bare-metal sever (by Rohit) ----------------------------------------------------- Here hackbench is running in threaded mode along with, running ping on CPU 0 and 1 as: 'ping -l 10000 -q -s 10 -f hostX' This test is running on 2 socket, 20 core and 40 threads Intel x86 machine: Number of loops is 10000 and runtime is in seconds (Lower is better). +--------------+-----------------+--------------------------+ |Task Groups | Without patch | With patch | | +-------+---------+----------------+---------+ |(Groups of 40)| Mean | Std Dev | Mean | Std Dev | +--------------+-------+---------+----------------+---------+ |1 | 0.851 | 0.007 | 0.828 (+2.77%)| 0.032 | |2 | 1.083 | 0.203 | 1.087 (-0.37%)| 0.246 | |4 | 1.601 | 0.051 | 1.611 (-0.62%)| 0.055 | |8 | 2.837 | 0.060 | 2.827 (+0.35%)| 0.031 | |16 | 5.139 | 0.133 | 5.107 (+0.63%)| 0.085 | |25 | 7.569 | 0.142 | 7.503 (+0.88%)| 0.143 | +--------------+-------+---------+----------------+---------+ [1] https://patchwork.kernel.org/patch/9991635/ Matt Fleming also ran several different hackbench tests and cyclic test to santiy-check that the patch doesn't harm other usecases. Tested-by: Matt Fleming Tested-by: Rohit Jain Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Atish Patra Cc: Brendan Jackman Cc: Chris Redpath Cc: Frederic Weisbecker Cc: Juri Lelli Cc: Len Brown Cc: Linus Torvalds Cc: Morten Ramussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Saravana Kannan Cc: Srinivas Pandruvada Cc: Steve Muckle Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vikram Mulukutla Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20171214212158.188190-1-joelaf@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index de43bd80a98f..6e775ac39eb4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5770,7 +5770,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) { - return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); + return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0); } /* -- cgit v1.2.3 From 6257e7047890084fbeeb84c641200b43f0668abc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 13 Dec 2017 15:23:20 +0530 Subject: sched/cpufreq: Initialize sg_cpu->flags to 0 Initializing sg_cpu->flags to SCHED_CPUFREQ_RT has no obvious benefit. The flags field wouldn't be used until the utilization update handler is called for the first time, and once that is called we will overwrite flags anyway. Initialize it to 0. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael Wysocki Cc: Thomas Gleixner Cc: Vincent Guittot Cc: dietmar.eggemann@arm.com Cc: joelaf@google.com Cc: morten.rasmussen@arm.com Cc: tkjos@android.com Link: http://lkml.kernel.org/r/763feda6424ced8486b25a0c52979634e6104478.1513158452.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d6717a3331a1..22d4630142ab 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -655,7 +655,7 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->flags = 0; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } -- cgit v1.2.3 From 5083452f8c7a11577e83842596f97625abbc9c8e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 13 Dec 2017 15:23:22 +0530 Subject: sched/cpufreq: Don't pass flags to sugov_set_iowait_boost() We are already passing sg_cpu as argument to sugov_set_iowait_boost() helper and the same can be used to retrieve the flags value. Get rid of the redundant argument. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael Wysocki Cc: Thomas Gleixner Cc: Vincent Guittot Cc: dietmar.eggemann@arm.com Cc: joelaf@google.com Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: tkjos@android.com Link: http://lkml.kernel.org/r/4ec5562b1a87e146ebab11fb5dde1ca9c763a7fb.1513158452.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 22d4630142ab..6dd1ec9e2995 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -187,10 +187,9 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) *max = cfs_max; } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, - unsigned int flags) +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) { - if (flags & SCHED_CPUFREQ_IOWAIT) { + if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -264,7 +263,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_set_iowait_boost(sg_cpu, time); sg_cpu->last_update = time; if (!sugov_should_update_freq(sg_policy, time)) @@ -349,7 +348,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->max = max; sg_cpu->flags = flags; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_set_iowait_boost(sg_cpu, time); sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { -- cgit v1.2.3 From 18cec7e0ddd5e28b7722f7049d715873373be3e9 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 15 Dec 2017 07:39:44 -0800 Subject: sched/fair: Remove impossible condition from find_idlest_group_cpu() find_idlest_group_cpu() goes through CPUs of a group previous selected by find_idlest_group(). find_idlest_group() returns NULL if the local group is the selected one and doesn't execute find_idlest_group_cpu if the group to which 'cpu' belongs to is chosen. So we're always guaranteed to call find_idlest_group_cpu() with a group to which 'cpu' is non-local. This makes one of the conditions in find_idlest_group_cpu() an impossible one, which we can get rid off. Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Brendan Jackman Reviewed-by: Vincent Guittot Cc: Android Kernel Cc: Atish Patra Cc: Chris Redpath Cc: Dietmar Eggemann Cc: EAS Dev Cc: Frederic Weisbecker Cc: Josef Bacik Cc: Juri Lelli Cc: Len Brown Cc: Linus Torvalds Cc: Morten Ramussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rohit Jain Cc: Saravana Kannan Cc: Srinivas Pandruvada Cc: Steve Muckle Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vikram Mulukutla Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20171215153944.220146-3-joelaf@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e775ac39eb4..3e7606d3ad0f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5950,7 +5950,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(cpu_rq(i)); - if (load < min_load || (load == min_load && i == this_cpu)) { + if (load < min_load) { min_load = load; least_loaded_cpu = i; } -- cgit v1.2.3 From 9783be2c0e90bbaceec3c471c4fb017bff7293ba Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 15 Dec 2017 07:39:43 -0800 Subject: sched/fair: Correct obsolete comment about cpufreq_update_util() Since the remote cpufreq callback work, the cpufreq_update_util() call can happen from remote CPUs. The comment about local CPUs is thus obsolete. Update it accordingly. Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Viresh Kumar Cc: Android Kernel Cc: Atish Patra Cc: Chris Redpath Cc: Dietmar Eggemann Cc: EAS Dev Cc: Frederic Weisbecker Cc: Josef Bacik Cc: Juri Lelli Cc: Len Brown Cc: Linus Torvalds Cc: Morten Ramussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rohit Jain Cc: Saravana Kannan Cc: Srinivas Pandruvada Cc: Steve Muckle Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vikram Mulukutla Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20171215153944.220146-2-joelaf@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e7606d3ad0f..59e66a5848d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. + * a real problem. * * It will not get called when we go idle, because the idle * thread is a different class (!fair), nor will the utilization -- cgit v1.2.3 From 7332dec055f2457c386032f7e9b2991eb05c2a0a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 19 Dec 2017 08:59:47 +0000 Subject: sched/fair: Only immediately migrate tasks due to interrupts if prev and target CPUs share cache If waking from an idle CPU due to an interrupt then it's possible that the waker task will be pulled to wake on the current CPU. Unfortunately, depending on the type of interrupt and IRQ configuration, there may not be a strong relationship between the CPU an interrupt was delivered on and the CPU a task was running on. For example, the interrupts could all be delivered to CPUs on one particular node due to the machine topology or IRQ affinity configuration. Another example is an interrupt for an IO completion which can be delivered to any CPU where there is no guarantee the data is either cache hot or even local. This patch was motivated by the observation that an IO workload was being pulled cross-node on a frequent basis when IO completed. From a wakeup latency perspective, it's still useful to know that an idle CPU is immediately available for use but lets only consider an automatic migration if the CPUs share cache to limit damage due to NUMA migrations. Migrations may still occur if wake_affine_weight determines it's appropriate. These are the throughput results for dbench running on ext4 comparing 4.15-rc3 and this patch on a 2-socket machine where interrupts due to IO completions can happen on any CPU. 4.15.0-rc3 4.15.0-rc3 vanilla lessmigrate Hmean 1 854.64 ( 0.00%) 865.01 ( 1.21%) Hmean 2 1229.60 ( 0.00%) 1274.44 ( 3.65%) Hmean 4 1591.81 ( 0.00%) 1628.08 ( 2.28%) Hmean 8 1845.04 ( 0.00%) 1831.80 ( -0.72%) Hmean 16 2038.61 ( 0.00%) 2091.44 ( 2.59%) Hmean 32 2327.19 ( 0.00%) 2430.29 ( 4.43%) Hmean 64 2570.61 ( 0.00%) 2568.54 ( -0.08%) Hmean 128 2481.89 ( 0.00%) 2499.28 ( 0.70%) Stddev 1 14.31 ( 0.00%) 5.35 ( 62.65%) Stddev 2 21.29 ( 0.00%) 11.09 ( 47.92%) Stddev 4 7.22 ( 0.00%) 6.80 ( 5.92%) Stddev 8 26.70 ( 0.00%) 9.41 ( 64.76%) Stddev 16 22.40 ( 0.00%) 20.01 ( 10.70%) Stddev 32 45.13 ( 0.00%) 44.74 ( 0.85%) Stddev 64 93.10 ( 0.00%) 93.18 ( -0.09%) Stddev 128 184.28 ( 0.00%) 177.85 ( 3.49%) Note the small increase in throughput for low thread counts but also note that the standard deviation for each sample during the test run is lower. The throughput figures for dbench can be misleading so the benchmark is actually modified to time the latency of the processing of one load file with many samples taken. The difference in latency is 4.15.0-rc3 4.15.0-rc3 vanilla lessmigrate Amean 1 21.71 ( 0.00%) 21.47 ( 1.08%) Amean 2 30.89 ( 0.00%) 29.58 ( 4.26%) Amean 4 47.54 ( 0.00%) 46.61 ( 1.97%) Amean 8 82.71 ( 0.00%) 82.81 ( -0.12%) Amean 16 149.45 ( 0.00%) 145.01 ( 2.97%) Amean 32 265.49 ( 0.00%) 248.43 ( 6.42%) Amean 64 463.23 ( 0.00%) 463.55 ( -0.07%) Amean 128 933.97 ( 0.00%) 935.50 ( -0.16%) Stddev 1 1.58 ( 0.00%) 1.54 ( 2.26%) Stddev 2 2.84 ( 0.00%) 2.95 ( -4.15%) Stddev 4 6.78 ( 0.00%) 6.85 ( -0.99%) Stddev 8 16.85 ( 0.00%) 16.37 ( 2.85%) Stddev 16 41.59 ( 0.00%) 41.04 ( 1.32%) Stddev 32 111.05 ( 0.00%) 105.11 ( 5.35%) Stddev 64 285.94 ( 0.00%) 288.01 ( -0.72%) Stddev 128 803.39 ( 0.00%) 809.73 ( -0.79%) It's a small improvement which is not surprising given that migrations that migrate to a different node as not that common. However, it is noticeable in the CPU migration statistics which are reduced by 24%. There was a query for v1 of this patch about NAS so here are the results for C-class using MPI for parallelisation on the same machine nas-mpi 4.15.0-rc3 4.15.0-rc3 vanilla noirq Time cg.C 24.25 ( 0.00%) 23.17 ( 4.45%) Time ep.C 8.22 ( 0.00%) 8.29 ( -0.85%) Time ft.C 22.67 ( 0.00%) 20.34 ( 10.28%) Time is.C 1.42 ( 0.00%) 1.47 ( -3.52%) Time lu.C 55.62 ( 0.00%) 54.81 ( 1.46%) Time mg.C 7.93 ( 0.00%) 7.91 ( 0.25%) 4.15.0-rc3 4.15.0-rc3 vanilla noirq-v1r1 User 3799.96 3748.34 System 672.10 626.15 Elapsed 91.91 79.49 lu.C sees a small gain, ft.C a large gain and ep.C and is.C see small regressions but in terms of absolute time, the difference is small and likely within run-to-run variance. System CPU usage is slightly reduced. schbench from Facebook was also requested. This is a bit of a mixed bag but it's important to note that this workload should not be heavily impacted by wakeups from interrupt context. 4.15.0-rc3 4.15.0-rc3 vanilla noirq-v1r1 Lat 50.00th-qrtle-1 41.00 ( 0.00%) 41.00 ( 0.00%) Lat 75.00th-qrtle-1 42.00 ( 0.00%) 42.00 ( 0.00%) Lat 90.00th-qrtle-1 43.00 ( 0.00%) 44.00 ( -2.33%) Lat 95.00th-qrtle-1 44.00 ( 0.00%) 46.00 ( -4.55%) Lat 99.00th-qrtle-1 57.00 ( 0.00%) 58.00 ( -1.75%) Lat 99.50th-qrtle-1 59.00 ( 0.00%) 59.00 ( 0.00%) Lat 99.90th-qrtle-1 67.00 ( 0.00%) 78.00 ( -16.42%) Lat 50.00th-qrtle-2 40.00 ( 0.00%) 51.00 ( -27.50%) Lat 75.00th-qrtle-2 45.00 ( 0.00%) 56.00 ( -24.44%) Lat 90.00th-qrtle-2 53.00 ( 0.00%) 59.00 ( -11.32%) Lat 95.00th-qrtle-2 57.00 ( 0.00%) 61.00 ( -7.02%) Lat 99.00th-qrtle-2 67.00 ( 0.00%) 71.00 ( -5.97%) Lat 99.50th-qrtle-2 69.00 ( 0.00%) 74.00 ( -7.25%) Lat 99.90th-qrtle-2 83.00 ( 0.00%) 77.00 ( 7.23%) Lat 50.00th-qrtle-4 51.00 ( 0.00%) 51.00 ( 0.00%) Lat 75.00th-qrtle-4 57.00 ( 0.00%) 56.00 ( 1.75%) Lat 90.00th-qrtle-4 60.00 ( 0.00%) 59.00 ( 1.67%) Lat 95.00th-qrtle-4 62.00 ( 0.00%) 62.00 ( 0.00%) Lat 99.00th-qrtle-4 73.00 ( 0.00%) 72.00 ( 1.37%) Lat 99.50th-qrtle-4 76.00 ( 0.00%) 74.00 ( 2.63%) Lat 99.90th-qrtle-4 85.00 ( 0.00%) 78.00 ( 8.24%) Lat 50.00th-qrtle-8 54.00 ( 0.00%) 58.00 ( -7.41%) Lat 75.00th-qrtle-8 59.00 ( 0.00%) 62.00 ( -5.08%) Lat 90.00th-qrtle-8 65.00 ( 0.00%) 66.00 ( -1.54%) Lat 95.00th-qrtle-8 67.00 ( 0.00%) 70.00 ( -4.48%) Lat 99.00th-qrtle-8 78.00 ( 0.00%) 79.00 ( -1.28%) Lat 99.50th-qrtle-8 81.00 ( 0.00%) 80.00 ( 1.23%) Lat 99.90th-qrtle-8 116.00 ( 0.00%) 83.00 ( 28.45%) Lat 50.00th-qrtle-16 65.00 ( 0.00%) 64.00 ( 1.54%) Lat 75.00th-qrtle-16 77.00 ( 0.00%) 71.00 ( 7.79%) Lat 90.00th-qrtle-16 83.00 ( 0.00%) 82.00 ( 1.20%) Lat 95.00th-qrtle-16 87.00 ( 0.00%) 87.00 ( 0.00%) Lat 99.00th-qrtle-16 95.00 ( 0.00%) 96.00 ( -1.05%) Lat 99.50th-qrtle-16 99.00 ( 0.00%) 103.00 ( -4.04%) Lat 99.90th-qrtle-16 104.00 ( 0.00%) 122.00 ( -17.31%) Lat 50.00th-qrtle-32 71.00 ( 0.00%) 73.00 ( -2.82%) Lat 75.00th-qrtle-32 91.00 ( 0.00%) 92.00 ( -1.10%) Lat 90.00th-qrtle-32 108.00 ( 0.00%) 107.00 ( 0.93%) Lat 95.00th-qrtle-32 118.00 ( 0.00%) 115.00 ( 2.54%) Lat 99.00th-qrtle-32 134.00 ( 0.00%) 129.00 ( 3.73%) Lat 99.50th-qrtle-32 138.00 ( 0.00%) 133.00 ( 3.62%) Lat 99.90th-qrtle-32 149.00 ( 0.00%) 146.00 ( 2.01%) Lat 50.00th-qrtle-39 83.00 ( 0.00%) 81.00 ( 2.41%) Lat 75.00th-qrtle-39 105.00 ( 0.00%) 102.00 ( 2.86%) Lat 90.00th-qrtle-39 120.00 ( 0.00%) 119.00 ( 0.83%) Lat 95.00th-qrtle-39 129.00 ( 0.00%) 128.00 ( 0.78%) Lat 99.00th-qrtle-39 153.00 ( 0.00%) 149.00 ( 2.61%) Lat 99.50th-qrtle-39 166.00 ( 0.00%) 156.00 ( 6.02%) Lat 99.90th-qrtle-39 12304.00 ( 0.00%) 12848.00 ( -4.42%) When heavily loaded (e.g. 99.50th-qrtle-39 indicates 39 threads), there are small gains in many cases. Otherwise it depends on the quartile used where it can be bad -- e.g. 75.00th-qrtle-2. However, even these results are probably a co-incidence. For this workload, much depends on what node the threads get placed on and their relative locality and not wakeups from interrupt context. A larger component on how it behaves would be automatic NUMA balancing where a fault incurred to measure locality would be a much larger contributer to latency than the wakeup path. This is the results from an almost identical machine that happened to run the same test. They only differ in terms of storage which is irrelevant for this test. 4.15.0-rc3 4.15.0-rc3 vanilla noirq-v1r1 Lat 50.00th-qrtle-1 41.00 ( 0.00%) 41.00 ( 0.00%) Lat 75.00th-qrtle-1 42.00 ( 0.00%) 42.00 ( 0.00%) Lat 90.00th-qrtle-1 44.00 ( 0.00%) 43.00 ( 2.27%) Lat 95.00th-qrtle-1 53.00 ( 0.00%) 45.00 ( 15.09%) Lat 99.00th-qrtle-1 59.00 ( 0.00%) 58.00 ( 1.69%) Lat 99.50th-qrtle-1 60.00 ( 0.00%) 59.00 ( 1.67%) Lat 99.90th-qrtle-1 86.00 ( 0.00%) 61.00 ( 29.07%) Lat 50.00th-qrtle-2 52.00 ( 0.00%) 41.00 ( 21.15%) Lat 75.00th-qrtle-2 57.00 ( 0.00%) 46.00 ( 19.30%) Lat 90.00th-qrtle-2 60.00 ( 0.00%) 53.00 ( 11.67%) Lat 95.00th-qrtle-2 62.00 ( 0.00%) 57.00 ( 8.06%) Lat 99.00th-qrtle-2 73.00 ( 0.00%) 68.00 ( 6.85%) Lat 99.50th-qrtle-2 74.00 ( 0.00%) 71.00 ( 4.05%) Lat 99.90th-qrtle-2 90.00 ( 0.00%) 75.00 ( 16.67%) Lat 50.00th-qrtle-4 57.00 ( 0.00%) 52.00 ( 8.77%) Lat 75.00th-qrtle-4 60.00 ( 0.00%) 58.00 ( 3.33%) Lat 90.00th-qrtle-4 62.00 ( 0.00%) 62.00 ( 0.00%) Lat 95.00th-qrtle-4 65.00 ( 0.00%) 65.00 ( 0.00%) Lat 99.00th-qrtle-4 76.00 ( 0.00%) 75.00 ( 1.32%) Lat 99.50th-qrtle-4 77.00 ( 0.00%) 77.00 ( 0.00%) Lat 99.90th-qrtle-4 87.00 ( 0.00%) 81.00 ( 6.90%) Lat 50.00th-qrtle-8 59.00 ( 0.00%) 57.00 ( 3.39%) Lat 75.00th-qrtle-8 63.00 ( 0.00%) 62.00 ( 1.59%) Lat 90.00th-qrtle-8 66.00 ( 0.00%) 67.00 ( -1.52%) Lat 95.00th-qrtle-8 68.00 ( 0.00%) 70.00 ( -2.94%) Lat 99.00th-qrtle-8 79.00 ( 0.00%) 80.00 ( -1.27%) Lat 99.50th-qrtle-8 80.00 ( 0.00%) 84.00 ( -5.00%) Lat 99.90th-qrtle-8 84.00 ( 0.00%) 90.00 ( -7.14%) Lat 50.00th-qrtle-16 65.00 ( 0.00%) 65.00 ( 0.00%) Lat 75.00th-qrtle-16 77.00 ( 0.00%) 75.00 ( 2.60%) Lat 90.00th-qrtle-16 84.00 ( 0.00%) 83.00 ( 1.19%) Lat 95.00th-qrtle-16 88.00 ( 0.00%) 87.00 ( 1.14%) Lat 99.00th-qrtle-16 97.00 ( 0.00%) 96.00 ( 1.03%) Lat 99.50th-qrtle-16 100.00 ( 0.00%) 104.00 ( -4.00%) Lat 99.90th-qrtle-16 110.00 ( 0.00%) 126.00 ( -14.55%) Lat 50.00th-qrtle-32 70.00 ( 0.00%) 71.00 ( -1.43%) Lat 75.00th-qrtle-32 92.00 ( 0.00%) 94.00 ( -2.17%) Lat 90.00th-qrtle-32 110.00 ( 0.00%) 110.00 ( 0.00%) Lat 95.00th-qrtle-32 121.00 ( 0.00%) 118.00 ( 2.48%) Lat 99.00th-qrtle-32 135.00 ( 0.00%) 137.00 ( -1.48%) Lat 99.50th-qrtle-32 140.00 ( 0.00%) 146.00 ( -4.29%) Lat 99.90th-qrtle-32 150.00 ( 0.00%) 160.00 ( -6.67%) Lat 50.00th-qrtle-39 80.00 ( 0.00%) 71.00 ( 11.25%) Lat 75.00th-qrtle-39 102.00 ( 0.00%) 91.00 ( 10.78%) Lat 90.00th-qrtle-39 118.00 ( 0.00%) 108.00 ( 8.47%) Lat 95.00th-qrtle-39 128.00 ( 0.00%) 117.00 ( 8.59%) Lat 99.00th-qrtle-39 149.00 ( 0.00%) 133.00 ( 10.74%) Lat 99.50th-qrtle-39 160.00 ( 0.00%) 139.00 ( 13.12%) Lat 99.90th-qrtle-39 13808.00 ( 0.00%) 4920.00 ( 64.37%) Despite being nearly identical, it showed a variety of major gains so I'm not convinced that heavy emphasis should be placed on this particular workload in terms of evaluating this particular patch. Further evidence of this is the fact that testing on a UMA machine showed small gains/losses even though the patch should be a no-op on UMA. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171219085947.13136-2-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 59e66a5848d0..9fec992410f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5687,8 +5687,8 @@ static int wake_wide(struct task_struct *p) * soonest. For the purpose of speed we only consider the waking and previous * CPU. * - * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or - * will be) idle. + * wake_affine_idle() - only considers 'now', it check if the waking CPU is + * cache-affine and is (or will be) idle. * * wake_affine_weight() - considers the weight to reflect the average * scheduling latency of the CPUs. This seems to work @@ -5699,7 +5699,13 @@ static bool wake_affine_idle(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { - if (idle_cpu(this_cpu)) + /* + * If this_cpu is idle, it implies the wakeup is from interrupt + * context. Only allow the move if cache is shared. Otherwise an + * interrupt intensive workload could force all tasks onto one + * node depending on the IO topology or IRQ affinity settings. + */ + if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return true; if (sync && cpu_rq(this_cpu)->nr_running == 1) -- cgit v1.2.3 From 34be39305a77b8b1ec9f279163c7cdb6cc719b91 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 12 Dec 2017 12:10:24 +0100 Subject: sched/deadline: Implement "runtime overrun signal" support This patch adds the possibility of getting the delivery of a SIGXCPU signal whenever there is a runtime overrun. The request is done through the sched_flags field within the sched_attr structure. Forward port of https://lkml.org/lkml/2009/10/16/170 Tested-by: Mathieu Poirier Signed-off-by: Juri Lelli Signed-off-by: Claudio Scordino Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1513077024-25461-1-git-send-email-claudio@evidence.eu.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- kernel/sched/deadline.c | 7 +++++++ kernel/time/posix-cpu-timers.c | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a794f8155cd5..e28391bf8b04 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4085,8 +4085,7 @@ recheck: return -EINVAL; } - if (attr->sched_flags & - ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) + if (attr->sched_flags & ~SCHED_FLAG_ALL) return -EINVAL; /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2473736c7616..4c666dbe5038 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1155,6 +1155,12 @@ static void update_curr_dl(struct rq *rq) throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; + + /* If requested, inform the user about runtime overruns. */ + if (dl_runtime_exceeded(dl_se) && + (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) + dl_se->dl_overrun = 1; + __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); @@ -2566,6 +2572,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; + dl_se->dl_overrun = 0; } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1f27887aa194..cf50ea34dbd1 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "posix-timers.h" @@ -791,6 +792,14 @@ check_timers_list(struct list_head *timers, return 0; } +static inline void check_dl_overrun(struct task_struct *tsk) +{ + if (tsk->dl.dl_overrun) { + tsk->dl.dl_overrun = 0; + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -804,6 +813,9 @@ static void check_thread_timers(struct task_struct *tsk, u64 expires; unsigned long soft; + if (dl_task(tsk)) + check_dl_overrun(tsk); + /* * If cputime_expires is zero, then there are no active * per thread CPU timers. @@ -906,6 +918,9 @@ static void check_process_timers(struct task_struct *tsk, struct task_cputime cputime; unsigned long soft; + if (dl_task(tsk)) + check_dl_overrun(tsk); + /* * If cputimer is not running, then there are no active * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). @@ -1111,6 +1126,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) return 1; } + if (dl_task(tsk) && tsk->dl.dl_overrun) + return 1; + return 0; } -- cgit v1.2.3 From d4edd662ac1657126df7ffd74a278958b133a77d Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 4 Dec 2017 11:23:18 +0100 Subject: sched/cpufreq: Use the DEADLINE utilization signal SCHED_DEADLINE tracks active utilization signal with a per dl_rq variable named running_bw. Make use of that to drive CPU frequency selection: add up FAIR and DEADLINE contribution to get the required CPU capacity to handle both requirements (while RT still selects max frequency). Co-authored-by: Claudio Scordino Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: alessio.balsini@arm.com Cc: bristot@redhat.com Cc: dietmar.eggemann@arm.com Cc: joelaf@google.com Cc: juri.lelli@redhat.com Cc: mathieu.poirier@linaro.org Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: rjw@rjwysocki.net Cc: rostedt@goodmis.org Cc: tkjos@android.com Cc: tommaso.cucinotta@santannapisa.it Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/20171204102325.5110-2-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 25 +++++++++++++++---------- kernel/sched/sched.h | 10 ++++++++++ 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6dd1ec9e2995..8d266bc5c67d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -179,12 +179,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long cfs_max; + unsigned long util_cfs = cpu_util_cfs(rq); + unsigned long util_dl = cpu_util_dl(rq); - cfs_max = arch_scale_cpu_capacity(NULL, cpu); + *max = arch_scale_cpu_capacity(NULL, cpu); - *util = min(rq->cfs.avg.util_avg, cfs_max); - *max = cfs_max; + /* + * Ideally we would like to set util_dl as min/guaranteed freq and + * util_cfs + util_dl as requested freq. However, cpufreq is not yet + * ready for such an interface. So, we only do the latter for now. + */ + *util = min(util_cfs + util_dl, *max); } static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) @@ -271,7 +276,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, busy = sugov_cpu_is_busy(sg_cpu); - if (flags & SCHED_CPUFREQ_RT_DL) { + if (flags & SCHED_CPUFREQ_RT) { next_f = policy->cpuinfo.max_freq; } else { sugov_get_util(&util, &max, sg_cpu->cpu); @@ -316,7 +321,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) j_sg_cpu->iowait_boost_pending = false; continue; } - if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) + if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) return policy->cpuinfo.max_freq; j_util = j_sg_cpu->util; @@ -352,7 +357,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - if (flags & SCHED_CPUFREQ_RT_DL) + if (flags & SCHED_CPUFREQ_RT) next_f = sg_policy->policy->cpuinfo.max_freq; else next_f = sugov_next_freq_shared(sg_cpu, time); @@ -382,9 +387,9 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); /* - * For RT and deadline tasks, the schedutil governor shoots the - * frequency to maximum. Special care must be taken to ensure that this - * kthread doesn't result in the same behavior. + * For RT tasks, the schedutil governor shoots the frequency to maximum. + * Special care must be taken to ensure that this kthread doesn't result + * in the same behavior. * * This is (mostly) guaranteed by the work_in_progress flag. The flag is * updated only at the end of the sugov_work() function and before that diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 43f5d6e936bb..136ab500daeb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2084,3 +2084,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ + return rq->cfs.avg.util_avg; +} -- cgit v1.2.3 From e0367b12674bf4420870cd0237e3ebafb2ec9593 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 4 Dec 2017 11:23:19 +0100 Subject: sched/deadline: Move CPU frequency selection triggering points Since SCHED_DEADLINE doesn't track utilization signal (but reserves a fraction of CPU bandwidth to tasks admitted to the system), there is no point in evaluating frequency changes during each tick event. Move frequency selection triggering points to where running_bw changes. Co-authored-by: Claudio Scordino Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Viresh Kumar Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: alessio.balsini@arm.com Cc: bristot@redhat.com Cc: dietmar.eggemann@arm.com Cc: joelaf@google.com Cc: juri.lelli@redhat.com Cc: mathieu.poirier@linaro.org Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: rjw@rjwysocki.net Cc: rostedt@goodmis.org Cc: tkjos@android.com Cc: tommaso.cucinotta@santannapisa.it Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/20171204102325.5110-3-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 7 ++++--- kernel/sched/sched.h | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 4c666dbe5038..f584837b32e7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -86,6 +86,8 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); } static inline @@ -98,6 +100,8 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); } static inline @@ -1134,9 +1138,6 @@ static void update_curr_dl(struct rq *rq) return; } - /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, SCHED_CPUFREQ_DL); - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 136ab500daeb..863964fbcfd2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2055,14 +2055,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); * The way cpufreq is currently arranged requires it to evaluate the CPU * performance state (frequency/voltage) on a regular basis to prevent it from * being stuck in a completely inadequate performance level for too long. - * That is not guaranteed to happen if the updates are only triggered from CFS, - * though, because they may not be coming in if RT or deadline tasks are active - * all the time (or there are RT and DL tasks only). + * That is not guaranteed to happen if the updates are only triggered from CFS + * and DL, though, because they may not be coming in if only RT tasks are + * active all the time (or there are RT tasks only). * - * As a workaround for that issue, this function is called by the RT and DL - * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * As a workaround for that issue, this function is called periodically by the + * RT sched class to trigger extra cpufreq updates to prevent it from stalling, * but that really is a band-aid. Going forward it should be replaced with - * solutions targeted more specifically at RT and DL tasks. + * solutions targeted more specifically at RT tasks. */ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { -- cgit v1.2.3 From 794a56ebd9a57db12abaec63f038c6eb073461f7 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 4 Dec 2017 11:23:20 +0100 Subject: sched/cpufreq: Change the worker kthread to SCHED_DEADLINE Worker kthread needs to be able to change frequency for all other threads. Make it special, just under STOP class. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: Viresh Kumar Cc: alessio.balsini@arm.com Cc: bristot@redhat.com Cc: dietmar.eggemann@arm.com Cc: joelaf@google.com Cc: juri.lelli@redhat.com Cc: mathieu.poirier@linaro.org Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: rjw@rjwysocki.net Cc: rostedt@goodmis.org Cc: tkjos@android.com Cc: tommaso.cucinotta@santannapisa.it Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/20171204102325.5110-4-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 ++++- kernel/sched/cpufreq_schedutil.c | 19 ++++++-- kernel/sched/deadline.c | 103 +++++++++++++++++++++++++++------------ kernel/sched/sched.h | 30 +++++++++++- 4 files changed, 129 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e28391bf8b04..402ef4fa0e1c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4085,7 +4085,7 @@ recheck: return -EINVAL; } - if (attr->sched_flags & ~SCHED_FLAG_ALL) + if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) return -EINVAL; /* @@ -4152,6 +4152,9 @@ recheck: } if (user) { + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return -EINVAL; + retval = security_task_setscheduler(p); if (retval) return retval; @@ -4207,7 +4210,8 @@ change: } #endif #ifdef CONFIG_SMP - if (dl_bandwidth_enabled() && dl_policy(policy)) { + if (dl_bandwidth_enabled() && dl_policy(policy) && + !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; /* @@ -4337,6 +4341,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr) } EXPORT_SYMBOL_GPL(sched_setattr); +int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, false, true); +} + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 8d266bc5c67d..bd5f9976892d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -474,7 +474,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) static int sugov_kthread_create(struct sugov_policy *sg_policy) { struct task_struct *thread; - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, + /* + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ + .sched_runtime = 1000000, + .sched_deadline = 10000000, + .sched_period = 10000000, + }; struct cpufreq_policy *policy = sg_policy->policy; int ret; @@ -492,10 +505,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) return PTR_ERR(thread); } - ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + ret = sched_setattr_nocheck(thread, &attr); if (ret) { kthread_stop(thread); - pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); return ret; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f584837b32e7..54a0dc1424a9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i) #endif static inline -void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -91,7 +91,7 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) } static inline -void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -105,7 +105,7 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) } static inline -void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; @@ -115,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) } static inline -void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; @@ -127,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); } +static inline +void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __add_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __sub_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __add_running_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __sub_running_bw(dl_se->dl_bw, dl_rq); +} + void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; + BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); + if (task_on_rq_queued(p)) return; rq = task_rq(p); if (p->dl.dl_non_contending) { - sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the @@ -148,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); } - sub_rq_bw(p->dl.dl_bw, &rq->dl); - add_rq_bw(new_bw, &rq->dl); + __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __add_rq_bw(new_bw, &rq->dl); } /* @@ -221,6 +251,9 @@ static void task_non_contending(struct task_struct *p) if (dl_se->dl_runtime == 0) return; + if (dl_entity_is_special(dl_se)) + return; + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); @@ -240,12 +273,12 @@ static void task_non_contending(struct task_struct *p) */ if (zerolag_time < 0) { if (dl_task(p)) - sub_running_bw(dl_se->dl_bw, dl_rq); + sub_running_bw(dl_se, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (p->state == TASK_DEAD) - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); @@ -272,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) return; if (flags & ENQUEUE_MIGRATED) - add_rq_bw(dl_se->dl_bw, dl_rq); + add_rq_bw(dl_se, dl_rq); if (dl_se->dl_non_contending) { dl_se->dl_non_contending = 0; @@ -293,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * when the "inactive timer" fired). * So, add it back. */ - add_running_bw(dl_se->dl_bw, dl_rq); + add_running_bw(dl_se, dl_rq); } } @@ -1149,6 +1182,9 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); + if (dl_entity_is_special(dl_se)) + return; + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); dl_se->runtime -= delta_exec; @@ -1211,8 +1247,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (p->state == TASK_DEAD && dl_se->dl_non_contending) { - sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); - sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); + sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; } @@ -1229,7 +1265,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); - sub_running_bw(dl_se->dl_bw, &rq->dl); + sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: task_rq_unlock(rq, p, &rf); @@ -1423,8 +1459,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) dl_check_constrained_dl(&p->dl); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(p->dl.dl_bw, &rq->dl); - add_running_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(&p->dl, &rq->dl); + add_running_bw(&p->dl, &rq->dl); } /* @@ -1464,8 +1500,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) __dequeue_task_dl(rq, p, flags); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(p->dl.dl_bw, &rq->dl); - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); } /* @@ -1571,7 +1607,7 @@ static void migrate_task_rq_dl(struct task_struct *p) */ raw_spin_lock(&rq->lock); if (p->dl.dl_non_contending) { - sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the @@ -1583,7 +1619,7 @@ static void migrate_task_rq_dl(struct task_struct *p) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); } - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); raw_spin_unlock(&rq->lock); } @@ -2026,11 +2062,11 @@ retry: } deactivate_task(rq, next_task, 0); - sub_running_bw(next_task->dl.dl_bw, &rq->dl); - sub_rq_bw(next_task->dl.dl_bw, &rq->dl); + sub_running_bw(&next_task->dl, &rq->dl); + sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); - add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); - add_running_bw(next_task->dl.dl_bw, &later_rq->dl); + add_rq_bw(&next_task->dl, &later_rq->dl); + add_running_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -2118,11 +2154,11 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); - sub_running_bw(p->dl.dl_bw, &src_rq->dl); - sub_rq_bw(p->dl.dl_bw, &src_rq->dl); + sub_running_bw(&p->dl, &src_rq->dl); + sub_rq_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); - add_rq_bw(p->dl.dl_bw, &this_rq->dl); - add_running_bw(p->dl.dl_bw, &this_rq->dl); + add_rq_bw(&p->dl, &this_rq->dl); + add_running_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -2231,7 +2267,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) task_non_contending(p); if (!task_on_rq_queued(p)) - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); /* * We cannot use inactive_task_timer() to invoke sub_running_bw() @@ -2263,7 +2299,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { - add_rq_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(&p->dl, &rq->dl); return; } @@ -2442,6 +2478,9 @@ int sched_dl_overflow(struct task_struct *p, int policy, u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return 0; + /* !deadline task may carry old deadline bandwidth */ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; @@ -2528,6 +2567,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + /* special dl tasks don't actually use any parameter */ + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return true; + /* deadline != 0 */ if (attr->sched_deadline == 0) return false; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 863964fbcfd2..c5197338ac47 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -156,13 +156,37 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * + * This is actually gross. :( + * + * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE + * tasks, but still be able to sleep. We need this on platforms that cannot + * atomically change clock frequency. Remove once fast switching will be + * available on such platforms. + * + * SUGOV stands for SchedUtil GOVernor. + */ +#define SCHED_FLAG_SUGOV 0x10000000 + +static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); +#else + return false; +#endif +} + /* * Tells if entity @a should preempt entity @b. */ static inline bool dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) { - return dl_time_before(a->deadline, b->deadline); + return dl_entity_is_special(a) || + dl_time_before(a->deadline, b->deadline); } /* @@ -2085,6 +2109,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #define arch_scale_freq_invariant() (false) #endif +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + static inline unsigned long cpu_util_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2094,3 +2120,5 @@ static inline unsigned long cpu_util_cfs(struct rq *rq) { return rq->cfs.avg.util_avg; } + +#endif -- cgit v1.2.3 From d18be45dbfef2e0bb12b9696c21aeae92f83b1ea Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 4 Dec 2017 11:23:21 +0100 Subject: sched/cpufreq: Split utilization signals To be able to treat utilization signals of different scheduling classes in different ways (e.g., CFS signal might be stale while DEADLINE signal is never stale by design) we need to split sugov_cpu::util signal in two: util_cfs and util_dl. This patch does that by also changing sugov_get_util() parameter list. After this change, aggregation of the different signals has to be performed by sugov_get_util() users (so that they can decide what to do with the different signals). Suggested-by: Rafael J. Wysocki Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Claudio Scordino Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: alessio.balsini@arm.com Cc: bristot@redhat.com Cc: dietmar.eggemann@arm.com Cc: joelaf@google.com Cc: juri.lelli@redhat.com Cc: mathieu.poirier@linaro.org Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: rjw@rjwysocki.net Cc: rostedt@goodmis.org Cc: tkjos@android.com Cc: tommaso.cucinotta@santannapisa.it Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/20171204102325.5110-5-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index bd5f9976892d..e9e0713f85f3 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -60,7 +60,8 @@ struct sugov_cpu { u64 last_update; /* The fields below are only needed when sharing a policy. */ - unsigned long util; + unsigned long util_cfs; + unsigned long util_dl; unsigned long max; unsigned int flags; @@ -176,20 +177,23 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long util_cfs = cpu_util_cfs(rq); - unsigned long util_dl = cpu_util_dl(rq); + struct rq *rq = cpu_rq(sg_cpu->cpu); - *max = arch_scale_cpu_capacity(NULL, cpu); + sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->util_cfs = cpu_util_cfs(rq); + sg_cpu->util_dl = cpu_util_dl(rq); +} +static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) +{ /* * Ideally we would like to set util_dl as min/guaranteed freq and * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. */ - *util = min(util_cfs + util_dl, *max); + return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); } static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) @@ -279,7 +283,9 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_RT) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max, sg_cpu->cpu); + sugov_get_util(sg_cpu); + max = sg_cpu->max; + util = sugov_aggregate_util(sg_cpu); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -324,8 +330,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) return policy->cpuinfo.max_freq; - j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; + j_util = sugov_aggregate_util(j_sg_cpu); if (j_util * max > j_max * util) { util = j_util; max = j_max; @@ -342,15 +348,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max, sg_cpu->cpu); - raw_spin_lock(&sg_policy->update_lock); - sg_cpu->util = util; - sg_cpu->max = max; + sugov_get_util(sg_cpu); sg_cpu->flags = flags; sugov_set_iowait_boost(sg_cpu, time); -- cgit v1.2.3 From 0fa7d181f1a60149061632266bb432b4b61acdac Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 4 Dec 2017 11:23:22 +0100 Subject: sched/cpufreq: Always consider all CPUs when deciding next freq No assumption can be made upon the rate at which frequency updates get triggered, as there are scheduling policies (like SCHED_DEADLINE) which don't trigger them so frequently. Remove such assumption from the code, by always considering SCHED_DEADLINE utilization signal as not stale. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Claudio Scordino Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: alessio.balsini@arm.com Cc: bristot@redhat.com Cc: dietmar.eggemann@arm.com Cc: joelaf@google.com Cc: juri.lelli@redhat.com Cc: mathieu.poirier@linaro.org Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: rjw@rjwysocki.net Cc: rostedt@goodmis.org Cc: tkjos@android.com Cc: tommaso.cucinotta@santannapisa.it Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/20171204102325.5110-6-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e9e0713f85f3..dd062a1c8cf0 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -315,17 +315,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) s64 delta_ns; /* - * If the CPU utilization was last updated before the previous - * frequency update and the time elapsed between the last update - * of the CPU utilization and the last frequency update is long - * enough, don't take the CPU into account as it probably is - * idle now (and clear iowait_boost for it). + * If the CFS CPU utilization was last updated before the + * previous frequency update and the time elapsed between the + * last update of the CPU utilization and the last frequency + * update is long enough, reset iowait_boost and util_cfs, as + * they are now probably stale. However, still consider the + * CPU contribution if it has some DEADLINE utilization + * (util_dl). */ delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; - continue; + j_sg_cpu->util_cfs = 0; + if (j_sg_cpu->util_dl == 0) + continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) return policy->cpuinfo.max_freq; -- cgit v1.2.3 From 7673c8a4c75d1cac2cd47156b9768f462683a09d Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 4 Dec 2017 11:23:23 +0100 Subject: sched/cpufreq: Remove arch_scale_freq_capacity()'s 'sd' parameter The 'sd' parameter is never used in arch_scale_freq_capacity() (and it's hard to see where information coming from scheduling domains might help doing frequency invariance scaling). Remove it; also in anticipation of moving arch_scale_freq_capacity() outside CONFIG_SMP. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: alessio.balsini@arm.com Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: dietmar.eggemann@arm.com Cc: joelaf@google.com Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: rjw@rjwysocki.net Cc: rostedt@goodmis.org Cc: tkjos@android.com Cc: tommaso.cucinotta@santannapisa.it Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/20171204102325.5110-7-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9fec992410f7..14859757bff0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3120,7 +3120,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; - scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_freq = arch_scale_freq_capacity(cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); delta += sa->period_contrib; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c5197338ac47..b7100192ecd3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1675,7 +1675,7 @@ extern void sched_avg_update(struct rq *rq); #ifndef arch_scale_freq_capacity static __always_inline -unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +unsigned long arch_scale_freq_capacity(int cpu) { return SCHED_CAPACITY_SCALE; } @@ -1694,7 +1694,7 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); + rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq)); sched_avg_update(rq); } #else -- cgit v1.2.3 From 7e1a9208f6c7e66bb4e5d2ed18dfd191230f431b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 4 Dec 2017 11:23:24 +0100 Subject: sched/cpufreq: Move arch_scale_{freq,cpu}_capacity() outside of #ifdef CONFIG_SMP Currently, frequency and cpu capacity scaling is only performed on CONFIG_SMP systems (as CFS PELT signals are only present for such systems). However, other scheduling classes want to do freq/cpu scaling, and for !CONFIG_SMP configurations as well. arch_scale_freq_capacity() is useful to implement frequency scaling even on !CONFIG_SMP platforms, so we simply move it outside CONFIG_SMP ifdeffery. Even if arch_scale_cpu_capacity() is not useful on !CONFIG_SMP platforms, we make a default implementation available for such configurations anyway to simplify scheduler code doing CPU scale invariance. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: alessio.balsini@arm.com Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: dietmar.eggemann@arm.com Cc: joelaf@google.com Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: rjw@rjwysocki.net Cc: tkjos@android.com Cc: tommaso.cucinotta@santannapisa.it Cc: vincent.guittot@linaro.org Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/20171204102325.5110-8-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b7100192ecd3..e122c89bdbdd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1670,9 +1670,6 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); - #ifndef arch_scale_freq_capacity static __always_inline unsigned long arch_scale_freq_capacity(int cpu) @@ -1681,6 +1678,9 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); + #ifndef arch_scale_cpu_capacity static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -1698,6 +1698,13 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) sched_avg_update(rq); } #else +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -- cgit v1.2.3 From 07881166a892fa4908ac4924660a7793f75d6544 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 4 Dec 2017 11:23:25 +0100 Subject: sched/deadline: Make bandwidth enforcement scale-invariant Apply frequency and CPU scale-invariance correction factor to bandwidth enforcement (similar to what we already do to fair utilization tracking). Each delta_exec gets scaled considering current frequency and maximum CPU capacity; which means that the reservation runtime parameter (that need to be specified profiling the task execution at max frequency on biggest capacity core) gets thus scaled accordingly. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: Viresh Kumar Cc: alessio.balsini@arm.com Cc: bristot@redhat.com Cc: dietmar.eggemann@arm.com Cc: joelaf@google.com Cc: juri.lelli@redhat.com Cc: mathieu.poirier@linaro.org Cc: morten.rasmussen@arm.com Cc: patrick.bellasi@arm.com Cc: rjw@rjwysocki.net Cc: rostedt@goodmis.org Cc: tkjos@android.com Cc: tommaso.cucinotta@santannapisa.it Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/20171204102325.5110-9-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 26 ++++++++++++++++++++++---- kernel/sched/fair.c | 2 -- kernel/sched/sched.h | 2 ++ 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 54a0dc1424a9..9bb0e0c412ec 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1151,7 +1151,8 @@ static void update_curr_dl(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec; + u64 delta_exec, scaled_delta_exec; + int cpu = cpu_of(rq); if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1185,9 +1186,26 @@ static void update_curr_dl(struct rq *rq) if (dl_entity_is_special(dl_se)) return; - if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) - delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); - dl_se->runtime -= delta_exec; + /* + * For tasks that participate in GRUB, we implement GRUB-PA: the + * spare reclaimed bandwidth is used to clock down frequency. + * + * For the others, we still need to scale reservation parameters + * according to current frequency and CPU maximum capacity. + */ + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { + scaled_delta_exec = grub_reclaim(delta_exec, + rq, + &curr->dl); + } else { + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + scaled_delta_exec = cap_scale(delta_exec, scale_freq); + scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); + } + + dl_se->runtime -= scaled_delta_exec; throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14859757bff0..1070803cb423 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3089,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e122c89bdbdd..2e95505e23c6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -156,6 +156,8 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* * !! For sched_setattr_nocheck() (kernel) only !! * -- cgit v1.2.3 From 4f58424da3deead2605e39a9df65f5f06107a3cb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 10 Jan 2018 04:35:12 -0800 Subject: cgroup: make cgroup.threads delegatable Make cgroup.threads file delegatable. The behavior of cgroup.threads should follow the behavior of cgroup.procs. Signed-off-by: Roman Gushchin Discovered-by: Michael Kerrisk Cc: Tejun Heo Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2cf06c274e4c..7e4c44538119 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4447,6 +4447,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.threads", + .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, -- cgit v1.2.3 From 40950343932879247861ae152dcb55e4555afdff Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 10 Jan 2018 09:20:54 +0000 Subject: bpf: fix spelling mistake: "obusing" -> "abusing" Trivial fix to spelling mistake in error message text. Signed-off-by: Colin Ian King Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b414d6b2d470..96ab165c873c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4472,7 +4472,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ map_ptr = env->insn_aux_data[i + delta].map_ptr; if (map_ptr == BPF_MAP_PTR_POISON) { - verbose(env, "tail_call obusing map_ptr\n"); + verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } if (!map_ptr->unpriv_array) -- cgit v1.2.3 From 7891a87efc7116590eaba57acc3c422487802c6f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Jan 2018 20:04:37 +0100 Subject: bpf: arsh is not supported in 32 bit alu thus reject it The following snippet was throwing an 'unknown opcode cc' warning in BPF interpreter: 0: (18) r0 = 0x0 2: (7b) *(u64 *)(r10 -16) = r0 3: (cc) (u32) r0 s>>= (u32) r0 4: (95) exit Although a number of JITs do support BPF_ALU | BPF_ARSH | BPF_{K,X} generation, not all of them do and interpreter does neither. We can leave existing ones and implement it later in bpf-next for the remaining ones, but reject this properly in verifier for the time being. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Reported-by: syzbot+93c4904c5c70348a6890@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 96ab165c873c..20eb04fd155e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2493,6 +2493,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { + verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); + return -EINVAL; + } + if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; -- cgit v1.2.3 From bbeb6e4323dad9b5e0ee9f60c223dd532e2403b1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Jan 2018 23:25:05 +0100 Subject: bpf, array: fix overflow in max_entries and undefined behavior in index_mask syzkaller tried to alloc a map with 0xfffffffd entries out of a userns, and thus unprivileged. With the recently added logic in b2157399cc98 ("bpf: prevent out-of-bounds speculation") we round this up to the next power of two value for max_entries for unprivileged such that we can apply proper masking into potentially zeroed out map slots. However, this will generate an index_mask of 0xffffffff, and therefore a + 1 will let this overflow into new max_entries of 0. This will pass allocation, etc, and later on map access we still enforce on the original attr->max_entries value which was 0xfffffffd, therefore triggering GPF all over the place. Thus bail out on overflow in such case. Moreover, on 32 bit archs roundup_pow_of_two() can also not be used, since fls_long(max_entries - 1) can result in 32 and 1UL << 32 in 32 bit space is undefined. Therefore, do this by hand in a 64 bit variable. This fixes all the issues triggered by syzkaller's reproducers. Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: syzbot+b0efb8e572d01bce1ae0@syzkaller.appspotmail.com Reported-by: syzbot+6c15e9744f75f2364773@syzkaller.appspotmail.com Reported-by: syzbot+d2f5524fb46fd3b312ee@syzkaller.appspotmail.com Reported-by: syzbot+61d23c95395cc90dbc2b@syzkaller.appspotmail.com Reported-by: syzbot+0d363c942452cca68c01@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index aaa319848e7d..ab94d304a634 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -56,7 +56,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; - u64 array_size; + u64 array_size, mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -74,13 +74,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); max_entries = attr->max_entries; - index_mask = roundup_pow_of_two(max_entries) - 1; - if (unpriv) + /* On 32 bit archs roundup_pow_of_two() with max_entries that has + * upper most bit set in u32 space is undefined behavior due to + * resulting 1U << 32, so do it manually here in u64 space. + */ + mask64 = fls_long(max_entries - 1); + mask64 = 1ULL << mask64; + mask64 -= 1; + + index_mask = mask64; + if (unpriv) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ max_entries = index_mask + 1; + /* Check for overflows. */ + if (max_entries < attr->max_entries) + return ERR_PTR(-E2BIG); + } array_size = sizeof(*array); if (percpu) -- cgit v1.2.3 From 84676c1f21e8ff54befe985f4f14dc1edc10046b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 12 Jan 2018 10:53:05 +0800 Subject: genirq/affinity: assign vectors to all possible CPUs Currently we assign managed interrupt vectors to all present CPUs. This works fine for systems were we only online/offline CPUs. But in case of systems that support physical CPU hotplug (or the virtualized version of it) this means the additional CPUs covered for in the ACPI tables or on the command line are not catered for. To fix this we'd either need to introduce new hotplug CPU states just for this case, or we can start assining vectors to possible but not present CPUs. Reported-by: Christian Borntraeger Tested-by: Christian Borntraeger Tested-by: Stefan Haberland Fixes: 4b855ad37194 ("blk-mq: Create hctx for each present CPU") Cc: linux-kernel@vger.kernel.org Cc: Thomas Gleixner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/irq/affinity.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e12d35108225..a37a3b4b6342 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static cpumask_var_t *alloc_node_to_present_cpumask(void) +static cpumask_var_t *alloc_node_to_possible_cpumask(void) { cpumask_var_t *masks; int node; @@ -62,7 +62,7 @@ out_unwind: return NULL; } -static void free_node_to_present_cpumask(cpumask_var_t *masks) +static void free_node_to_possible_cpumask(cpumask_var_t *masks) { int node; @@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks) kfree(masks); } -static void build_node_to_present_cpumask(cpumask_var_t *masks) +static void build_node_to_possible_cpumask(cpumask_var_t *masks) { int cpu; - for_each_present_cpu(cpu) + for_each_possible_cpu(cpu) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } -static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask, +static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { - if (cpumask_intersects(mask, node_to_present_cpumask[n])) { + if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; - cpumask_var_t nmsk, *node_to_present_cpumask; + cpumask_var_t nmsk, *node_to_possible_cpumask; /* * If there aren't any vectors left after applying the pre/post @@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!masks) goto out; - node_to_present_cpumask = alloc_node_to_present_cpumask(); - if (!node_to_present_cpumask) + node_to_possible_cpumask = alloc_node_to_possible_cpumask(); + if (!node_to_possible_cpumask) goto out; /* Fill out vectors at the beginning that don't need affinity */ @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); - build_node_to_present_cpumask(node_to_present_cpumask); - nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask, + build_node_to_possible_cpumask(node_to_possible_cpumask); + nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, &nodemsk); /* @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, - node_to_present_cpumask[n]); + node_to_possible_cpumask[n]); if (++curvec == last_affv) break; } @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]); + cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -192,7 +192,7 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); - free_node_to_present_cpumask(node_to_present_cpumask); + free_node_to_possible_cpumask(node_to_possible_cpumask); out: free_cpumask_var(nmsk); return masks; @@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity return 0; get_online_cpus(); - ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; + ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv; put_online_cpus(); return ret; } -- cgit v1.2.3 From 62635ea8c18f0f62df4cc58379e4f1d33afd5801 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 11 Jan 2018 09:53:35 +0900 Subject: workqueue: avoid hard lockups in show_workqueue_state() show_workqueue_state() can print out a lot of messages while being in atomic context, e.g. sysrq-t -> show_workqueue_state(). If the console device is slow it may end up triggering NMI hard lockup watchdog. Signed-off-by: Sergey Senozhatsky Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.5+ --- kernel/workqueue.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 43d18cb46308..f699122dab32 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "workqueue_internal.h" @@ -4463,6 +4464,12 @@ void show_workqueue_state(void) if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); spin_unlock_irqrestore(&pwq->pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } } @@ -4490,6 +4497,12 @@ void show_workqueue_state(void) pr_cont("\n"); next_pool: spin_unlock_irqrestore(&pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } rcu_read_unlock_sched(); -- cgit v1.2.3 From a0b1280368d1e91ab72f849ef095b4f07a39bbf1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 12 Jan 2018 16:53:14 -0800 Subject: kdump: write correct address of mem_section into vmcoreinfo Depending on configuration mem_section can now be an array or a pointer to an array allocated dynamically. In most cases, we can continue to refer to it as 'mem_section' regardless of what it is. But there's one exception: '&mem_section' means "address of the array" if mem_section is an array, but if mem_section is a pointer, it would mean "address of the pointer". We've stepped onto this in kdump code. VMCOREINFO_SYMBOL(mem_section) writes down address of pointer into vmcoreinfo, not array as we wanted. Let's introduce VMCOREINFO_SYMBOL_ARRAY() that would handle the situation correctly for both cases. Link: http://lkml.kernel.org/r/20180112162532.35896-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") Acked-by: Baoquan He Acked-by: Dave Young Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b3663896278e..4f63597c824d 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); -- cgit v1.2.3 From c366287ebd698ef5e3de300d90cd62ee9ee7373e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2018 17:43:23 -0800 Subject: bpf: fix divides by zero Divides by zero are not nice, lets avoid them if possible. Also do_div() seems not needed when dealing with 32bit operands, but this seems a minor detail. Fixes: bd4cf0ed331a ("net: filter: rework/optimize internal BPF interpreter's instruction set") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 51ec2dda7f08..7949e8b8f94e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -956,7 +956,7 @@ select_insn: DST = tmp; CONT; ALU_MOD_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); @@ -975,7 +975,7 @@ select_insn: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); -- cgit v1.2.3 From c1e2f0eaf015fb7076d51a339011f2383e6dd389 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Dec 2017 13:49:39 +0100 Subject: futex: Avoid violating the 10th rule of futex Julia reported futex state corruption in the following scenario: waiter waker stealer (prio > waiter) futex(WAIT_REQUEUE_PI, uaddr, uaddr2, timeout=[N ms]) futex_wait_requeue_pi() futex_wait_queue_me() freezable_schedule() futex(LOCK_PI, uaddr2) futex(CMP_REQUEUE_PI, uaddr, uaddr2, 1, 0) /* requeues waiter to uaddr2 */ futex(UNLOCK_PI, uaddr2) wake_futex_pi() cmp_futex_value_locked(uaddr2, waiter) wake_up_q() task> futex(LOCK_PI, uaddr2) __rt_mutex_start_proxy_lock() try_to_take_rt_mutex() /* steals lock */ rt_mutex_set_owner(lock, stealer) rt_mutex_wait_proxy_lock() __rt_mutex_slowlock() try_to_take_rt_mutex() /* fails, lock held by stealer */ if (timeout && !timeout->task) return -ETIMEDOUT; fixup_owner() /* lock wasn't acquired, so, fixup_pi_state_owner skipped */ return -ETIMEDOUT; /* At this point, we've returned -ETIMEDOUT to userspace, but the * futex word shows waiter to be the owner, and the pi_mutex has * stealer as the owner */ futex_lock(LOCK_PI, uaddr2) -> bails with EDEADLK, futex word says we're owner. And suggested that what commit: 73d786bd043e ("futex: Rework inconsistent rt_mutex/futex_q state") removes from fixup_owner() looks to be just what is needed. And indeed it is -- I completely missed that requeue_pi could also result in this case. So we need to restore that, except that subsequent patches, like commit: 16ffa12d7425 ("futex: Pull rt_mutex_futex_unlock() out from under hb->lock") changed all the locking rules. Even without that, the sequence: - if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) { - locked = 1; - goto out; - } - raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); - owner = rt_mutex_owner(&q->pi_state->pi_mutex); - if (!owner) - owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); - ret = fixup_pi_state_owner(uaddr, q, owner); already suggests there were races; otherwise we'd never have to look at next_owner. So instead of doing 3 consecutive wait_lock sections with who knows what races, we do it all in a single section. Additionally, the usage of pi_state->owner in fixup_owner() was only safe because only the rt_mutex owner would modify it, which this additional case wrecks. Luckily the values can only change away and not to the value we're testing, this means we can do a speculative test and double check once we have the wait_lock. Fixes: 73d786bd043e ("futex: Rework inconsistent rt_mutex/futex_q state") Reported-by: Julia Cartwright Reported-by: Gratian Crisan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Julia Cartwright Tested-by: Gratian Crisan Cc: Darren Hart Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20171208124939.7livp7no2ov65rrc@hirez.programming.kicks-ass.net --- kernel/futex.c | 83 +++++++++++++++++++++++++++++++++-------- kernel/locking/rtmutex.c | 26 +++++++++---- kernel/locking/rtmutex_common.h | 1 + 3 files changed, 87 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 57d0b3657e16..9e69589b9248 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2294,21 +2294,17 @@ static void unqueue_me_pi(struct futex_q *q) spin_unlock(q->lock_ptr); } -/* - * Fixup the pi_state owner with the new owner. - * - * Must be called with hash bucket lock held and mm->sem held for non - * private futexes. - */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *argowner) { - u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; u32 uval, uninitialized_var(curval), newval; - struct task_struct *oldowner; + struct task_struct *oldowner, *newowner; + u32 newtid; int ret; + lockdep_assert_held(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); oldowner = pi_state->owner; @@ -2317,11 +2313,17 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, newtid |= FUTEX_OWNER_DIED; /* - * We are here either because we stole the rtmutex from the - * previous highest priority waiter or we are the highest priority - * waiter but have failed to get the rtmutex the first time. + * We are here because either: + * + * - we stole the lock and pi_state->owner needs updating to reflect + * that (@argowner == current), * - * We have to replace the newowner TID in the user space variable. + * or: + * + * - someone stole our lock and we need to fix things to point to the + * new owner (@argowner == NULL). + * + * Either way, we have to replace the TID in the user space variable. * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state @@ -2334,6 +2336,42 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * in the PID check in lookup_pi_state. */ retry: + if (!argowner) { + if (oldowner != current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + + if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { + /* We got the lock after all, nothing to fix. */ + ret = 0; + goto out_unlock; + } + + /* + * Since we just failed the trylock; there must be an owner. + */ + newowner = rt_mutex_owner(&pi_state->pi_mutex); + BUG_ON(!newowner); + } else { + WARN_ON_ONCE(argowner != current); + if (oldowner == current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + newowner = argowner; + } + + newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; @@ -2434,15 +2472,28 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case: * - * We can safely read pi_state->owner without holding wait_lock - * because we now own the rt_mutex, only the owner will attempt - * to change it. + * Speculative pi_state->owner read (we don't hold wait_lock); + * since we own the lock pi_state->owner == current is the + * stable state, anything else needs more attention. */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); goto out; } + /* + * If we didn't get the lock; check if anybody stole it from us. In + * that case, we need to fix up the uval to point to them instead of + * us, otherwise bad things happen. [10] + * + * Another speculative read; pi_state->owner == current is unstable + * but needs our attention. + */ + if (q->pi_state->owner == current) { + ret = fixup_pi_state_owner(uaddr, q, NULL); + goto out; + } + /* * Paranoia check. If we did not take the lock, then we should not be * the owner of the rt_mutex. diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6f3dba6e4e9e..65cc0cb984e6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) +{ + int ret = try_to_take_rt_mutex(lock, current, NULL); + + /* + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + + return ret; +} + /* * Slow path try-lock function: */ @@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ raw_spin_lock_irqsave(&lock->wait_lock, flags); - ret = try_to_take_rt_mutex(lock, current, NULL); - - /* - * try_to_take_rt_mutex() sets the lock waiters bit - * unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); + ret = __rt_mutex_slowtrylock(lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); @@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) return rt_mutex_slowtrylock(lock); } +int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + /** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 124e98ca0b17..68686b3ec3c1 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_futex_trylock(struct rt_mutex *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, -- cgit v1.2.3 From fbe0e839d1e22d88810f3ee3e2f1479be4c0aa4a Mon Sep 17 00:00:00 2001 From: Li Jinyue Date: Thu, 14 Dec 2017 17:04:54 +0800 Subject: futex: Prevent overflow by strengthen input validation UBSAN reports signed integer overflow in kernel/futex.c: UBSAN: Undefined behaviour in kernel/futex.c:2041:18 signed integer overflow: 0 - -2147483648 cannot be represented in type 'int' Add a sanity check to catch negative values of nr_wake and nr_requeue. Signed-off-by: Li Jinyue Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: dvhart@infradead.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1513242294-31786-1-git-send-email-lijinyue@huawei.com --- kernel/futex.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 9e69589b9248..8c5424dd5924 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + /* * When PI not supported: return -ENOSYS if requeue_pi is true, * consequently the compiler knows requeue_pi is always false past -- cgit v1.2.3 From a9445e47d897054876b8f43e46dc5a3eca2b844d Mon Sep 17 00:00:00 2001 From: "Max R. P. Grossmann" Date: Mon, 8 Jan 2018 20:01:57 +0100 Subject: posix-cpu-timers: Make set_process_cpu_timer() more robust Because the return value of cpu_timer_sample_group() is not checked, compilers and static checkers can legitimately warn about a potential use of the uninitialized variable 'now'. This is not a runtime issue as all call sites hand in valid clock ids. Also cpu_timer_sample_group() is invoked unconditionally even when the result is not used because *oldval is NULL. Make the invocation conditional and check the return value. [ tglx: Massage changelog ] Signed-off-by: Max R. P. Grossmann Signed-off-by: Thomas Gleixner Cc: john.stultz@linaro.org Link: https://lkml.kernel.org/r/20180108190157.10048-1-m@max.pm --- kernel/time/posix-cpu-timers.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cef79ca5bbd5..ec9f5da6f163 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1189,9 +1189,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, u64 now; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); - cpu_timer_sample_group(clock_idx, tsk, &now); - if (oldval) { + if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) { /* * We are setting itimer. The *oldval is absolute and we update * it to be relative, *newval argument is relative and we update -- cgit v1.2.3 From 68fda450a7df51cff9e5a4d4a4d9d0d5f2589153 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jan 2018 18:59:52 -0800 Subject: bpf: fix 32-bit divide by zero due to some JITs doing if (src_reg == 0) check in 64-bit mode for div/mod operations mask upper 32-bits of src register before doing the check Fixes: 622582786c9e ("net: filter: x86: internal BPF JIT") Fixes: 7a12b5031c6b ("sparc64: Add eBPF JIT.") Reported-by: syzbot+48340bb518e88849e2e3@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 20eb04fd155e..b7448347e6b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4445,6 +4445,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + /* due to JIT bugs clear upper 32-bits of src register + * before div/mod operation + */ + insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); + insn_buf[1] = *insn; + cnt = 2; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; -- cgit v1.2.3 From ed4bbf7910b28ce3c691aef28d245585eaabda06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jan 2018 23:19:49 +0100 Subject: timers: Unconditionally check deferrable base When the timer base is checked for expired timers then the deferrable base must be checked as well. This was missed when making the deferrable base independent of base::nohz_active. Fixes: ced6d5c11d3e ("timers: Use deferrable base independent of base::nohz_active") Signed-off-by: Thomas Gleixner Cc: Anna-Maria Gleixner Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Paul McKenney Cc: stable@vger.kernel.org Cc: rt@linutronix.de --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 89a9e1b4264a..0bcf00e3ce48 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1696,7 +1696,7 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ if (time_before(jiffies, base->clk)) { - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; -- cgit v1.2.3 From a0e3a18f4baf8e3754ac1e56f0ade924d0c0c721 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 15 Jan 2018 10:47:09 -0500 Subject: ring-buffer: Bring back context level recursive checks Commit 1a149d7d3f45 ("ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler") replaced the context level recursion checks with a simple counter. This would prevent the ring buffer code from recursively calling itself more than the max number of contexts that exist (Normal, softirq, irq, nmi). But this change caused a lockup in a specific case, which was during suspend and resume using a global clock. Adding a stack dump to see where this occurred, the issue was in the trace global clock itself: trace_buffer_lock_reserve+0x1c/0x50 __trace_graph_entry+0x2d/0x90 trace_graph_entry+0xe8/0x200 prepare_ftrace_return+0x69/0xc0 ftrace_graph_caller+0x78/0xa8 queued_spin_lock_slowpath+0x5/0x1d0 trace_clock_global+0xb0/0xc0 ring_buffer_lock_reserve+0xf9/0x390 The function graph tracer traced queued_spin_lock_slowpath that was called by trace_clock_global. This pointed out that the trace_clock_global() is not reentrant, as it takes a spin lock. It depended on the ring buffer recursive lock from letting that happen. By removing the context detection and adding just a max number of allowable recursions, it allowed the trace_clock_global() to be entered again and try to retake the spinlock it already held, causing a deadlock. Fixes: 1a149d7d3f45 ("ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler") Reported-by: David Weinehall Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 62 +++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9ab18995ff1e..0cddf60186da 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2534,29 +2534,59 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. There are four - * different contexts that we need to consider. + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. * - * Normal context. - * SoftIRQ context - * IRQ context - * NMI context + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. * - * If for some reason the ring buffer starts to recurse, we - * only allow that to happen at most 4 times (one for each - * context). If it happens 5 times, then we consider this a - * recusive loop and do not let it go further. + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. */ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - if (cpu_buffer->current_context >= 4) + unsigned int val = cpu_buffer->current_context; + unsigned long pc = preempt_count(); + int bit; + + if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + bit = RB_CTX_NORMAL; + else + bit = pc & NMI_MASK ? RB_CTX_NMI : + pc & HARDIRQ_MASK ? RB_CTX_IRQ : + pc & SOFTIRQ_OFFSET ? 2 : RB_CTX_SOFTIRQ; + + if (unlikely(val & (1 << bit))) return 1; - cpu_buffer->current_context++; - /* Interrupts must see this update */ - barrier(); + val |= (1 << bit); + cpu_buffer->current_context = val; return 0; } @@ -2564,9 +2594,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - /* Don't let the dec leak out */ - barrier(); - cpu_buffer->current_context--; + cpu_buffer->current_context &= cpu_buffer->current_context - 1; } /** -- cgit v1.2.3 From 68e76e034b6b1c1ce2eece1ab8ae4008e14be470 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Jan 2018 11:07:27 -0800 Subject: tracing: Prevent PROFILE_ALL_BRANCHES when FORTIFY_SOURCE=y I regularly get 50 MB - 60 MB files during kernel randconfig builds. These large files mostly contain (many repeats of; e.g., 124,594): In file included from ../include/linux/string.h:6:0, from ../include/linux/uuid.h:20, from ../include/linux/mod_devicetable.h:13, from ../scripts/mod/devicetable-offsets.c:3: ../include/linux/compiler.h:64:4: warning: '______f' is static but declared in inline function 'strcpy' which is not static [enabled by default] ______f = { \ ^ ../include/linux/compiler.h:56:23: note: in expansion of macro '__trace_if' ^ ../include/linux/string.h:425:2: note: in expansion of macro 'if' if (p_size == (size_t)-1 && q_size == (size_t)-1) ^ This only happens when CONFIG_FORTIFY_SOURCE=y and CONFIG_PROFILE_ALL_BRANCHES=y, so prevent PROFILE_ALL_BRANCHES if FORTIFY_SOURCE=y. Link: http://lkml.kernel.org/r/9199446b-a141-c0c3-9678-a3f9107f2750@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 904c952ac383..f54dc62b599c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -355,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES - bool "Profile all if conditionals" + bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () -- cgit v1.2.3 From ae67badaa1643253998cb21d5782e4ea7c231a29 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jan 2018 23:30:51 +0100 Subject: hrtimer: Optimize the hrtimer code by using static keys for migration_enable/nohz_active The hrtimer_cpu_base::migration_enable and ::nohz_active fields were originally introduced to avoid accessing global variables for these decisions. Still that results in a (cache hot) load and conditional branch, which can be avoided by using static keys. Implement it with static keys and optimize for the most critical case of high performance networking which tends to disable the timer migration functionality. No change in functionality. Signed-off-by: Thomas Gleixner Cc: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: Frederic Weisbecker Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1801142327490.2371@nanos Link: https://lkml.kernel.org/r/20171221104205.7269-2-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 17 +++------- kernel/time/tick-internal.h | 19 +++++++---- kernel/time/tick-sched.c | 2 +- kernel/time/timer.c | 83 +++++++++++++++++++++++---------------------- 4 files changed, 60 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d32520840fde..1d06d2bde733 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -178,23 +178,16 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } -#ifdef CONFIG_NO_HZ_COMMON -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) -{ - if (pinned || !base->migration_enabled) - return base; - return &per_cpu(hrtimer_bases, get_nohz_timer_target()); -} -#else static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && !pinned) + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +#endif return base; } -#endif /* * We switch the timer base to a power-optimized selected CPU target, @@ -969,7 +962,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * Kick to reschedule the next tick to handle the new timer * on dynticks target. */ - if (new_base->cpu_base->nohz_active) + if (is_timers_nohz_active()) wake_up_nohz_cpu(new_base->cpu_base->cpu); } else { hrtimer_reprogram(timer, new_base); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f8e1845aa464..f690628e068c 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -150,14 +150,19 @@ static inline void tick_nohz_init(void) { } #ifdef CONFIG_NO_HZ_COMMON extern unsigned long tick_nohz_active; -#else +extern void timers_update_nohz(void); +extern struct static_key_false timers_nohz_active; +static inline bool is_timers_nohz_active(void) +{ + return static_branch_likely(&timers_nohz_active); +} +# ifdef CONFIG_SMP +extern struct static_key_false timers_migration_enabled; +# endif +#else /* CONFIG_NO_HZ_COMMON */ +static inline void timers_update_nohz(void) { } #define tick_nohz_active (0) -#endif - -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void timers_update_migration(bool update_nohz); -#else -static inline void timers_update_migration(bool update_nohz) { } +static inline bool is_timers_nohz_active(void) { return false; } #endif DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f7cc7abfcf25..29a5733eff83 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1107,7 +1107,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) ts->nohz_mode = mode; /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) - timers_update_migration(true); + timers_update_nohz(); } /** diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0bcf00e3ce48..d530f72b32f9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -200,8 +200,6 @@ struct timer_base { unsigned long clk; unsigned long next_expiry; unsigned int cpu; - bool migration_enabled; - bool nohz_active; bool is_idle; bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); @@ -210,45 +208,57 @@ struct timer_base { static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON + +DEFINE_STATIC_KEY_FALSE(timers_nohz_active); +static DEFINE_MUTEX(timer_keys_mutex); + +static void timer_update_keys(struct work_struct *work); +static DECLARE_WORK(timer_update_work, timer_update_keys); + +#ifdef CONFIG_SMP unsigned int sysctl_timer_migration = 1; -void timers_update_migration(bool update_nohz) +DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); + +static void timers_update_migration(void) { - bool on = sysctl_timer_migration && tick_nohz_active; - unsigned int cpu; + if (sysctl_timer_migration && tick_nohz_active) + static_branch_enable(&timers_migration_enabled); + else + static_branch_disable(&timers_migration_enabled); +} +#else +static inline void timers_update_migration(void) { } +#endif /* !CONFIG_SMP */ - /* Avoid the loop, if nothing to update */ - if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on) - return; +static void timer_update_keys(struct work_struct *work) +{ + mutex_lock(&timer_keys_mutex); + timers_update_migration(); + static_branch_enable(&timers_nohz_active); + mutex_unlock(&timer_keys_mutex); +} - for_each_possible_cpu(cpu) { - per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; - per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on; - per_cpu(hrtimer_bases.migration_enabled, cpu) = on; - if (!update_nohz) - continue; - per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; - per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true; - per_cpu(hrtimer_bases.nohz_active, cpu) = true; - } +void timers_update_nohz(void) +{ + schedule_work(&timer_update_work); } int timer_migration_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - static DEFINE_MUTEX(mutex); int ret; - mutex_lock(&mutex); + mutex_lock(&timer_keys_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) - timers_update_migration(false); - mutex_unlock(&mutex); + timers_update_migration(); + mutex_unlock(&timer_keys_mutex); return ret; } -#endif +#endif /* NO_HZ_COMMON */ static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) @@ -534,7 +544,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer) static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + if (!is_timers_nohz_active()) return; /* @@ -849,21 +859,20 @@ static inline struct timer_base *get_timer_base(u32 tflags) return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } -#ifdef CONFIG_NO_HZ_COMMON static inline struct timer_base * get_target_base(struct timer_base *base, unsigned tflags) { -#ifdef CONFIG_SMP - if ((tflags & TIMER_PINNED) || !base->migration_enabled) - return get_timer_this_cpu_base(tflags); - return get_timer_cpu_base(tflags, get_nohz_timer_target()); -#else - return get_timer_this_cpu_base(tflags); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && + !(tflags & TIMER_PINNED)) + return get_timer_cpu_base(tflags, get_nohz_timer_target()); #endif + return get_timer_this_cpu_base(tflags); } static inline void forward_timer_base(struct timer_base *base) { +#ifdef CONFIG_NO_HZ_COMMON unsigned long jnow; /* @@ -887,16 +896,8 @@ static inline void forward_timer_base(struct timer_base *base) base->clk = jnow; else base->clk = base->next_expiry; -} -#else -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ - return get_timer_this_cpu_base(tflags); -} - -static inline void forward_timer_base(struct timer_base *base) { } #endif +} /* -- cgit v1.2.3 From d05ca13b8d3f685667b3b1748fa89285466270c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Dec 2017 11:41:31 +0100 Subject: hrtimer: Correct blatantly incorrect comment The protection of a hrtimer which runs its callback against migration to a different CPU has nothing to do with hard interrupt context. The protection against migration of a hrtimer running the expiry callback is the pointer in the cpu_base which holds a pointer to the currently running timer. This pointer is evaluated in the code which potentially switches the timer base and makes sure it's kept on the CPU on which the callback is running. Reported-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Reviewed-by: Frederic Weisbecker Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-3-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1d06d2bde733..7687355c00ff 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1195,9 +1195,9 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, timer->is_rel = false; /* - * Because we run timers from hardirq context, there is no chance - * they get migrated to another cpu, therefore its safe to unlock - * the timer base. + * The timer is marked as running in the CPU base, so it is + * protected against migration to a different CPU even if the lock + * is dropped. */ raw_spin_unlock(&cpu_base->lock); trace_hrtimer_expire_entry(timer, now); -- cgit v1.2.3 From 907777136f80d0cc0f714e5a389c4dfa9b4670ee Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:33 +0100 Subject: hrtimer: Clean up the 'int clock' parameter of schedule_hrtimeout_range_clock() schedule_hrtimeout_range_clock() uses an 'int clock' parameter for the clock ID, instead of the customary predefined "clockid_t" type. In hrtimer coding style the canonical variable name for the clock ID is 'clock_id', therefore change the name of the parameter here as well to make it all consistent. While at it, clean up the description for the 'clock_id' and 'mode' function parameters. The clock modes and the clock IDs are not restricted as the comment suggests. Fix the mode description as well for the callers of schedule_hrtimeout_range_clock(). No functional changes intended. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-5-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 7687355c00ff..f2de328bb8d5 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1662,12 +1662,12 @@ void __init hrtimers_init(void) * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME + * @mode: timer mode + * @clock_id: timer clock to be used */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode, int clock) + const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; @@ -1688,7 +1688,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } - hrtimer_init_on_stack(&t.timer, clock, mode); + hrtimer_init_on_stack(&t.timer, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_init_sleeper(&t, current); @@ -1710,7 +1710,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless @@ -1749,7 +1749,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless -- cgit v1.2.3 From 6de6250c759781daeadca784d0cc34ae73f3b502 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:34 +0100 Subject: hrtimer: Fix hrtimer_start[_range_ns]() function descriptions The hrtimer_start[_range_ns]() functions start a timer reliably on this CPU only when HRTIMER_MODE_PINNED is set. Furthermore the HRTIMER_MODE_PINNED mode is not considered when a hrtimer is initialized. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-6-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f2de328bb8d5..fd08729de5d2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -924,12 +924,12 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, } /** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) @@ -1107,7 +1107,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used - * @mode: timer mode abs/rel + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL); pinned is not considered here! */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) -- cgit v1.2.3 From 48d0c9becc7f3c66874c100c126459a9da0fdced Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:35 +0100 Subject: hrtimer: Ensure POSIX compliance (relative CLOCK_REALTIME hrtimers) The POSIX specification defines that relative CLOCK_REALTIME timers are not affected by clock modifications. Those timers have to use CLOCK_MONOTONIC to ensure POSIX compliance. The introduction of the additional HRTIMER_MODE_PINNED mode broke this requirement for pinned timers. There is no user space visible impact because user space timers are not using pinned mode, but for consistency reasons this needs to be fixed. Check whether the mode has the HRTIMER_MODE_REL bit set instead of comparing with HRTIMER_MODE_ABS. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Fixes: 597d0275736d ("timers: Framework for identifying pinned timers") Link: http://lkml.kernel.org/r/20171221104205.7269-7-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fd08729de5d2..60faade2bb4e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1095,7 +1095,12 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, cpu_base = raw_cpu_ptr(&hrtimer_bases); - if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) + /* + * POSIX magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they needs to become CLOCK_MONOTONIC to + * ensure POSIX compliance. + */ + if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = hrtimer_clockid_to_base(clock_id); -- cgit v1.2.3 From 63e2ed3659752a4850e0ef3a07f809988fcd74a4 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:38 +0100 Subject: tracing/hrtimer: Print the hrtimer mode in the 'hrtimer_start' tracepoint The 'hrtimer_start' tracepoint lacks the mode information. The mode is important because consecutive starts can switch from ABS to REL or from PINNED to non PINNED. Append the mode field. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-10-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 60faade2bb4e..f4f46589d7cc 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -435,10 +435,11 @@ debug_init(struct hrtimer *timer, clockid_t clockid, trace_hrtimer_init(timer, clockid, mode); } -static inline void debug_activate(struct hrtimer *timer) +static inline void debug_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { debug_hrtimer_activate(timer); - trace_hrtimer_start(timer); + trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) @@ -828,9 +829,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * Returns 1 when the new timer is the leftmost timer in the tree. */ static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + enum hrtimer_mode mode) { - debug_activate(timer); + debug_activate(timer, mode); base->cpu_base->active_bases |= 1 << base->index; @@ -953,7 +955,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - leftmost = enqueue_hrtimer(timer, new_base); + leftmost = enqueue_hrtimer(timer, new_base, mode); if (!leftmost) goto unlock; @@ -1222,7 +1224,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) - enqueue_hrtimer(timer, base); + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. @@ -1621,7 +1623,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * sort out already expired timers and reprogram the * event device. */ - enqueue_hrtimer(timer, new_base); + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } -- cgit v1.2.3 From c272ca58c3ec5631f4ab507489d9477f74efe645 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:39 +0100 Subject: hrtimer: Switch 'for' loop to _ffs() evaluation Looping over all clock bases to find active bits is suboptimal if not all bases are active. Avoid this by converting it to a __ffs() evaluation. The functionallity is outsourced into its own function and is called via a macro as suggested by Peter Zijlstra. Suggested-by: Peter Zijlstra Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-11-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f4f46589d7cc..cfcf8decf102 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -448,6 +448,23 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } +static struct hrtimer_clock_base * +__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) +{ + unsigned int idx; + + if (!*active) + return NULL; + + idx = __ffs(*active); + *active &= ~(1U << idx); + + return &cpu_base->clock_base[idx]; +} + +#define for_each_active_base(base, cpu_base, active) \ + while ((base = __next_base((cpu_base), &(active)))) + #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, struct hrtimer *timer) @@ -459,18 +476,15 @@ static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { - struct hrtimer_clock_base *base = cpu_base->clock_base; + struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases; ktime_t expires, expires_next = KTIME_MAX; hrtimer_update_next_timer(cpu_base, NULL); - for (; active; base++, active >>= 1) { + for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; - if (!(active & 0x01)) - continue; - next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); @@ -1241,16 +1255,13 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) { - struct hrtimer_clock_base *base = cpu_base->clock_base; + struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases; - for (; active; base++, active >>= 1) { + for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; - if (!(active & 0x01)) - continue; - basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { -- cgit v1.2.3 From 3f0b9e8eec7262648ab9c8321bf931624ee5c10a Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:40 +0100 Subject: hrtimer: Store running timer in hrtimer_clock_base The pointer to the currently running timer is stored in hrtimer_cpu_base before the base lock is dropped and the callback is invoked. This results in two levels of indirections and the upcoming support for softirq based hrtimer requires splitting the "running" storage into soft and hard IRQ context expiry. Storing both in the cpu base would require conditionals in all code paths accessing that information. It's possible to have a per clock base sequence count and running pointer without changing the semantics of the related mechanisms because the timer base pointer cannot be changed while a timer is running the callback. Unfortunately this makes cpu_clock base larger than 32 bytes on 32-bit kernels. Instead of having huge gaps due to alignment, remove the alignment and let the compiler pack CPU base for 32-bit kernels. The resulting cache access patterns are fortunately not really different from the current behaviour. On 64-bit kernels the 64-byte alignment stays and the behaviour is unchanged. This was determined by analyzing the resulting layout and looking at the number of cache lines involved for the frequently used clocks. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-12-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index cfcf8decf102..e56805fe5d00 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -70,7 +70,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), - .seq = SEQCNT_ZERO(hrtimer_bases.seq), .clock_base = { { @@ -118,7 +117,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .seq = SEQCNT_ZERO(migration_cpu_base), .clock_base = { { .cpu_base = &migration_cpu_base, }, }, }; @@ -1148,19 +1146,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ bool hrtimer_active(const struct hrtimer *timer) { - struct hrtimer_cpu_base *cpu_base; + struct hrtimer_clock_base *base; unsigned int seq; do { - cpu_base = READ_ONCE(timer->base->cpu_base); - seq = raw_read_seqcount_begin(&cpu_base->seq); + base = READ_ONCE(timer->base); + seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || - cpu_base->running == timer) + base->running == timer) return true; - } while (read_seqcount_retry(&cpu_base->seq, seq) || - cpu_base != READ_ONCE(timer->base->cpu_base)); + } while (read_seqcount_retry(&base->seq, seq) || + base != READ_ONCE(timer->base)); return false; } @@ -1194,16 +1192,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); - cpu_base->running = timer; + base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in - * hrtimer_active() cannot observe cpu_base->running == NULL && + * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ - raw_write_seqcount_barrier(&cpu_base->seq); + raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; @@ -1244,13 +1242,13 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in - * hrtimer_active() cannot observe cpu_base->running == NULL && + * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ - raw_write_seqcount_barrier(&cpu_base->seq); + raw_write_seqcount_barrier(&base->seq); - WARN_ON_ONCE(cpu_base->running != timer); - cpu_base->running = NULL; + WARN_ON_ONCE(base->running != timer); + base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) -- cgit v1.2.3 From 28bfd18bf3daa5db8bb3422ea7138c8b7d2444ac Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:42 +0100 Subject: hrtimer: Make the hrtimer_cpu_base::hres_active field unconditional, to simplify the code The hrtimer_cpu_base::hres_active_member field depends on CONFIG_HIGH_RES_TIMERS=y currently, and all related functions to this member are conditional as well. To simplify the code make it unconditional and set it to zero during initialization. (This will also help with the upcoming softirq based hrtimers code.) The conditional code sections can be avoided by adding IS_ENABLED(HIGHRES) conditionals into common functions, which ensures dead code elimination. There is no functional change. Suggested-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-14-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e56805fe5d00..b688090093d6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -512,6 +512,20 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) offs_real, offs_boot, offs_tai); } +/* + * Is the high resolution mode active ? + */ +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +{ + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? + cpu_base->hres_active : 0; +} + +static inline int hrtimer_hres_active(void) +{ + return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -540,19 +554,6 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -/* - * Is the high resolution mode active ? - */ -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) -{ - return cpu_base->hres_active; -} - -static inline int hrtimer_hres_active(void) -{ - return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); -} - /* * Reprogram the event source with checking both queues for the * next event @@ -661,7 +662,6 @@ static void hrtimer_reprogram(struct hrtimer *timer, static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next = KTIME_MAX; - base->hres_active = 0; } /* @@ -720,8 +720,6 @@ void clock_was_set_delayed(void) #else -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } -static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } static inline void @@ -1600,6 +1598,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) } cpu_base->cpu = cpu; + cpu_base->hres_active = 0; hrtimer_init_hres(cpu_base); return 0; } -- cgit v1.2.3 From 851cff8caf4d638d001aac6e57a3511abd94f100 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:43 +0100 Subject: hrtimer: Use accesor functions instead of direct access __hrtimer_hres_active() is now available unconditionally, so replace open coded direct accesses to hrtimer_cpu_base.hres_active. No functional change. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-15-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b688090093d6..5a624f9c8408 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -564,7 +564,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; - if (!cpu_base->hres_active) + if (!__hrtimer_hres_active(cpu_base)) return; expires_next = __hrtimer_get_next_event(cpu_base); @@ -673,7 +673,7 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - if (!base->hres_active) + if (!__hrtimer_hres_active(base)) return; raw_spin_lock(&base->lock); -- cgit v1.2.3 From 07a9a7eae86abb796468b225586086d7c4cb59fc Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:44 +0100 Subject: hrtimer: Make the remote enqueue check unconditional hrtimer_cpu_base.expires_next is used to cache the next event armed in the timer hardware. The value is used to check whether an hrtimer can be enqueued remotely. If the new hrtimer is expiring before expires_next, then remote enqueue is not possible as the remote hrtimer hardware cannot be accessed for reprogramming to an earlier expiry time. The remote enqueue check is currently conditional on CONFIG_HIGH_RES_TIMERS=y and hrtimer_cpu_base.hres_active. There is no compelling reason to make this conditional. Move hrtimer_cpu_base.expires_next out of the CONFIG_HIGH_RES_TIMERS=y guarded area and remove the conditionals in hrtimer_check_target(). The check is currently a NOOP for the CONFIG_HIGH_RES_TIMERS=n and the !hrtimer_cpu_base.hres_active case because in these cases nothing updates hrtimer_cpu_base.expires_next yet. This will be changed with later patches which further reduce the #ifdef zoo in this code. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-16-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5a624f9c8408..a9ab67f3e5d5 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -154,26 +154,21 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * With HIGHRES=y we do not migrate the timer when it is expiring - * before the next event on the target cpu because we cannot reprogram - * the target cpu hardware and we would cause it to fire late. + * We do not migrate the timer when it is expiring before the next + * event on the target cpu. When high resolution is enabled, we cannot + * reprogram the target cpu hardware and we would cause it to fire + * late. To keep it simple, we handle the high resolution enabled and + * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ static int hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { -#ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires; - if (!new_base->cpu_base->hres_active) - return 0; - expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); return expires <= new_base->cpu_base->expires_next; -#else - return 0; -#endif } static inline @@ -656,14 +651,6 @@ static void hrtimer_reprogram(struct hrtimer *timer, tick_program_event(expires, 1); } -/* - * Initialize the high resolution related parts of cpu_base - */ -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) -{ - base->expires_next = KTIME_MAX; -} - /* * Retrigger next event is called after clock was set * @@ -729,7 +716,6 @@ static inline int hrtimer_reprogram(struct hrtimer *timer, { return 0; } -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1599,7 +1585,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->cpu = cpu; cpu_base->hres_active = 0; - hrtimer_init_hres(cpu_base); + cpu_base->expires_next = KTIME_MAX; return 0; } -- cgit v1.2.3 From eb27926ba05233dc4f2052cc9d4f19359ec3cd2c Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:45 +0100 Subject: hrtimer: Make hrtimer_cpu_base.next_timer handling unconditional hrtimer_cpu_base.next_timer stores the pointer to the next expiring timer in a CPU base. This pointer cannot be dereferenced and is solely used to check whether a hrtimer which is removed is the hrtimer which is the first to expire in the CPU base. If this is the case, then the timer hardware needs to be reprogrammed to avoid an extra interrupt for nothing. Again, this is conditional functionality, but there is no compelling reason to make this conditional. As a preparation, hrtimer_cpu_base.next_timer needs to be available unconditonally. Aside of that the upcoming support for softirq based hrtimers requires access to this pointer unconditionally as well, so our motivation is not entirely simplicity based. Make the update of hrtimer_cpu_base.next_timer unconditional and remove the #ifdef cruft. The impact on CONFIG_HIGH_RES_TIMERS=n && CONFIG_NOHZ=n is marginal as it's just a store on an already dirtied cacheline. No functional change. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-17-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index a9ab67f3e5d5..26abaa7b0419 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -459,21 +459,13 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) while ((base = __next_base((cpu_base), &(active)))) #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) -static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, - struct hrtimer *timer) -{ -#ifdef CONFIG_HIGH_RES_TIMERS - cpu_base->next_timer = timer; -#endif -} - static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases; ktime_t expires, expires_next = KTIME_MAX; - hrtimer_update_next_timer(cpu_base, NULL); + cpu_base->next_timer = NULL; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; @@ -483,7 +475,7 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; - hrtimer_update_next_timer(cpu_base, timer); + cpu_base->next_timer = timer; } } /* -- cgit v1.2.3 From 11a9fe069e341ac53bddb8fe1a85ea986cff1a42 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:46 +0100 Subject: hrtimer: Make hrtimer_reprogramm() unconditional hrtimer_reprogram() needs to be available unconditionally for softirq based hrtimers. Move the function and all required struct members out of the CONFIG_HIGH_RES_TIMERS #ifdef. There is no functional change because hrtimer_reprogram() is only invoked when hrtimer_cpu_base.hres_active is true. Making it unconditional increases the text size for the CONFIG_HIGH_RES_TIMERS=n case, but avoids replication of that code for the upcoming softirq based hrtimers support. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-18-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 129 ++++++++++++++++++++++++-------------------------- 1 file changed, 62 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 26abaa7b0419..63d804aea1ea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -581,68 +581,6 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) tick_program_event(cpu_base->expires_next, 1); } -/* - * When a timer is enqueued and expires earlier than the already enqueued - * timers, we have to check, whether it expires earlier than the timer for - * which the clock event device was armed. - * - * Called with interrupts disabled and base->cpu_base.lock held - */ -static void hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - - WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); - - /* - * If the timer is not on the current cpu, we cannot reprogram - * the other cpus clock event device. - */ - if (base->cpu_base != cpu_base) - return; - - /* - * If the hrtimer interrupt is running, then it will - * reevaluate the clock bases and reprogram the clock event - * device. The callbacks are always executed in hard interrupt - * context so we don't need an extra check for a running - * callback. - */ - if (cpu_base->in_hrtirq) - return; - - /* - * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Set it to 0. - */ - if (expires < 0) - expires = 0; - - if (expires >= cpu_base->expires_next) - return; - - /* Update the pointer to the next expiring timer */ - cpu_base->next_timer = timer; - - /* - * If a hang was detected in the last timer interrupt then we - * do not schedule a timer which is earlier than the expiry - * which we enforced in the hang detection. We want the system - * to make progress. - */ - if (cpu_base->hang_detected) - return; - - /* - * Program the timer hardware. We enforce the expiry for - * events which are already in the past. - */ - cpu_base->expires_next = expires; - tick_program_event(expires, 1); -} - /* * Retrigger next event is called after clock was set * @@ -703,15 +641,72 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ +/* + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. + * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static void hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + + /* + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. + */ + if (base->cpu_base != cpu_base) + return; + + /* + * If the hrtimer interrupt is running, then it will + * reevaluate the clock bases and reprogram the clock event + * device. The callbacks are always executed in hard interrupt + * context so we don't need an extra check for a running + * callback. + */ + if (cpu_base->in_hrtirq) + return; + + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Set it to 0. + */ + if (expires < 0) + expires = 0; + + if (expires >= cpu_base->expires_next) + return; + + /* Update the pointer to the next expiring timer */ + cpu_base->next_timer = timer; + + /* + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (cpu_base->hang_detected) + return; + + /* + * Program the timer hardware. We enforce the expiry for + * events which are already in the past. + */ + cpu_base->expires_next = expires; + tick_program_event(expires, 1); +} + /* * Clock realtime was set * -- cgit v1.2.3 From ebba2c723f38a766546b2eaf828c522576c791d4 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:47 +0100 Subject: hrtimer: Make hrtimer_force_reprogramm() unconditionally available hrtimer_force_reprogram() needs to be available unconditionally for softirq based hrtimers. Move the function and all required struct members out of the CONFIG_HIGH_RES_TIMERS #ifdef. There is no functional change because hrtimer_force_reprogram() is only invoked when hrtimer_cpu_base.hres_active is true and CONFIG_HIGH_RES_TIMERS=y. Making it unconditional increases the text size for the CONFIG_HIGH_RES_TIMERS=n case slightly, but avoids replication of that code for the upcoming softirq based hrtimers support. Most of the code gets eliminated in the CONFIG_HIGH_RES_TIMERS=n case by the compiler. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-19-anna-maria@linutronix.de [ Made it build on !CONFIG_HIGH_RES_TIMERS ] Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 60 ++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 63d804aea1ea..2b3222ea2a6c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -458,7 +458,6 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) #define for_each_active_base(base, cpu_base, active) \ while ((base = __next_base((cpu_base), &(active)))) -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base; @@ -487,7 +486,6 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) expires_next = 0; return expires_next; } -#endif static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { @@ -513,34 +511,6 @@ static inline int hrtimer_hres_active(void) return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } -/* High resolution timer related functions */ -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer enabled ? - */ -static bool hrtimer_hres_enabled __read_mostly = true; -unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; -EXPORT_SYMBOL_GPL(hrtimer_resolution); - -/* - * Enable / Disable high resolution mode - */ -static int __init setup_hrtimer_hres(char *str) -{ - return (kstrtobool(str, &hrtimer_hres_enabled) == 0); -} - -__setup("highres=", setup_hrtimer_hres); - -/* - * hrtimer_high_res_enabled - query, if the highres mode is enabled - */ -static inline int hrtimer_is_hres_enabled(void) -{ - return hrtimer_hres_enabled; -} - /* * Reprogram the event source with checking both queues for the * next event @@ -581,6 +551,34 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) tick_program_event(cpu_base->expires_next, 1); } +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer enabled ? + */ +static bool hrtimer_hres_enabled __read_mostly = true; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); + +/* + * Enable / Disable high resolution mode + */ +static int __init setup_hrtimer_hres(char *str) +{ + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); +} + +__setup("highres=", setup_hrtimer_hres); + +/* + * hrtimer_high_res_enabled - query, if the highres mode is enabled + */ +static inline int hrtimer_is_hres_enabled(void) +{ + return hrtimer_hres_enabled; +} + /* * Retrigger next event is called after clock was set * @@ -639,8 +637,6 @@ void clock_was_set_delayed(void) static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } -static inline void -hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ -- cgit v1.2.3 From 61bb4bcb79c7afcd0bf0d20aef4704977172fd60 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:48 +0100 Subject: hrtimer: Unify hrtimer removal handling When the first hrtimer on the current CPU is removed, hrtimer_force_reprogram() is invoked but only when CONFIG_HIGH_RES_TIMERS=y and hrtimer_cpu_base.hres_active is set. hrtimer_force_reprogram() updates hrtimer_cpu_base.expires_next and reprograms the clock event device. When CONFIG_HIGH_RES_TIMERS=y and hrtimer_cpu_base.hres_active is set, a pointless hrtimer interrupt can be prevented. hrtimer_check_target() makes the 'can remote enqueue' decision. As soon as hrtimer_check_target() is unconditionally available and hrtimer_cpu_base.expires_next is updated by hrtimer_reprogram(), hrtimer_force_reprogram() needs to be available unconditionally as well to prevent the following scenario with CONFIG_HIGH_RES_TIMERS=n: - the first hrtimer on this CPU is removed and hrtimer_force_reprogram() is not executed - CPU goes idle (next timer is calculated and hrtimers are taken into account) - a hrtimer is enqueued remote on the idle CPU: hrtimer_check_target() compares expiry value and hrtimer_cpu_base.expires_next. The expiry value is after expires_next, so the hrtimer is enqueued. This timer will fire late, if it expires before the effective first hrtimer on this CPU and the comparison was with an outdated expires_next value. To prevent this scenario, make hrtimer_force_reprogram() unconditional except the effective reprogramming part, which gets eliminated by the compiler in the CONFIG_HIGH_RES_TIMERS=n case. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-20-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 2b3222ea2a6c..e6a78ae103ca 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -521,9 +521,6 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; - if (!__hrtimer_hres_active(cpu_base)) - return; - expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) @@ -532,6 +529,9 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) cpu_base->expires_next = expires_next; /* + * If hres is not active, hardware does not have to be + * reprogrammed yet. + * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following @@ -545,7 +545,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) * set. So we'd effectivly block all timers until the T2 event * fires. */ - if (cpu_base->hang_detected) + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(cpu_base->expires_next, 1); @@ -844,7 +844,6 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); -#ifdef CONFIG_HIGH_RES_TIMERS /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first @@ -855,7 +854,6 @@ static void __remove_hrtimer(struct hrtimer *timer, */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); -#endif } /* -- cgit v1.2.3 From 14c803419de6acba08e143d51813ac5e0f3443b8 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:49 +0100 Subject: hrtimer: Unify remote enqueue handling hrtimer_reprogram() is conditionally invoked from hrtimer_start_range_ns() when hrtimer_cpu_base.hres_active is true. In the !hres_active case there is a special condition for the nohz_active case: If the newly enqueued timer expires before the first expiring timer on a remote CPU then the remote CPU needs to be notified and woken up from a NOHZ idle sleep to take the new first expiring timer into account. Previous changes have already established the prerequisites to make the remote enqueue behaviour the same whether high resolution mode is active or not: If the to be enqueued timer expires before the first expiring timer on a remote CPU, then it cannot be enqueued there. This was done for the high resolution mode because there is no way to access the remote CPU timer hardware. The same is true for NOHZ, but was handled differently by unconditionally enqueuing the timer and waking up the remote CPU so it can reprogram its timer. Again there is no compelling reason for this difference. hrtimer_check_target(), which makes the 'can remote enqueue' decision is already unconditional, but not yet functional because nothing updates hrtimer_cpu_base.expires_next in the !hres_active case. To unify this the following changes are required: 1) Make the store of the new first expiry time unconditonal in hrtimer_reprogram() and check __hrtimer_hres_active() before proceeding to the actual hardware access. This check also lets the compiler eliminate the rest of the function in case of CONFIG_HIGH_RES_TIMERS=n. 2) Invoke hrtimer_reprogram() unconditionally from hrtimer_start_range_ns() 3) Remove the remote wakeup special case for the !high_res && nohz_active case. Confine the timers_nohz_active static key to timer.c which is the only user now. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-21-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 18 ++++++------------ kernel/time/tick-internal.h | 6 ------ kernel/time/timer.c | 9 ++++++++- 3 files changed, 14 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e6a78ae103ca..1c68bf21f603 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -685,21 +685,24 @@ static void hrtimer_reprogram(struct hrtimer *timer, /* Update the pointer to the next expiring timer */ cpu_base->next_timer = timer; + cpu_base->expires_next = expires; /* + * If hres is not active, hardware does not have to be + * programmed yet. + * * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry * which we enforced in the hang detection. We want the system * to make progress. */ - if (cpu_base->hang_detected) + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; /* * Program the timer hardware. We enforce the expiry for * events which are already in the past. */ - cpu_base->expires_next = expires; tick_program_event(expires, 1); } @@ -936,16 +939,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, if (!leftmost) goto unlock; - if (!hrtimer_is_hres_active(timer)) { - /* - * Kick to reschedule the next tick to handle the new timer - * on dynticks target. - */ - if (is_timers_nohz_active()) - wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else { - hrtimer_reprogram(timer, new_base); - } + hrtimer_reprogram(timer, new_base); unlock: unlock_hrtimer_base(timer, &flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f690628e068c..e277284c2831 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -151,18 +151,12 @@ static inline void tick_nohz_init(void) { } #ifdef CONFIG_NO_HZ_COMMON extern unsigned long tick_nohz_active; extern void timers_update_nohz(void); -extern struct static_key_false timers_nohz_active; -static inline bool is_timers_nohz_active(void) -{ - return static_branch_likely(&timers_nohz_active); -} # ifdef CONFIG_SMP extern struct static_key_false timers_migration_enabled; # endif #else /* CONFIG_NO_HZ_COMMON */ static inline void timers_update_nohz(void) { } #define tick_nohz_active (0) -static inline bool is_timers_nohz_active(void) { return false; } #endif DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d530f72b32f9..48150ab42de9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -210,7 +210,7 @@ static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); #ifdef CONFIG_NO_HZ_COMMON -DEFINE_STATIC_KEY_FALSE(timers_nohz_active); +static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); static DEFINE_MUTEX(timer_keys_mutex); static void timer_update_keys(struct work_struct *work); @@ -258,6 +258,13 @@ int timer_migration_handler(struct ctl_table *table, int write, mutex_unlock(&timer_keys_mutex); return ret; } + +static inline bool is_timers_nohz_active(void) +{ + return static_branch_unlikely(&timers_nohz_active); +} +#else +static inline bool is_timers_nohz_active(void) { return false; } #endif /* NO_HZ_COMMON */ static unsigned long round_jiffies_common(unsigned long j, int cpu, -- cgit v1.2.3 From 2ac2dccce9d16a7b1a8fddf69a955d249375bce4 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:50 +0100 Subject: hrtimer: Make remote enqueue decision less restrictive The current decision whether a timer can be queued on a remote CPU checks for timer->expiry <= remote_cpu_base.expires_next. This is too restrictive because a timer with the same expiry time as an existing timer will be enqueued on right-hand size of the existing timer inside the rbtree, i.e. behind the first expiring timer. So its safe to allow enqueuing timers with the same expiry time as the first expiring timer on a remote CPU base. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-22-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c68bf21f603..f4a56fbae662 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -168,7 +168,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires <= new_base->cpu_base->expires_next; + return expires < new_base->cpu_base->expires_next; } static inline -- cgit v1.2.3 From 3ec7a3ee9f15f6dcac1591902d85b94c2a4b520d Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:51 +0100 Subject: hrtimer: Remove the 'base' parameter from hrtimer_reprogram() hrtimer_reprogram() must have access to the hrtimer_clock_base of the new first expiring timer to access hrtimer_clock_base.offset for adjusting the expiry time to CLOCK_MONOTONIC. This is required to evaluate whether the new left most timer in the hrtimer_clock_base is the first expiring timer of all clock bases in a hrtimer_cpu_base. The only user of hrtimer_reprogram() is hrtimer_start_range_ns(), which has a pointer to hrtimer_clock_base() already and hands it in as a parameter. But hrtimer_start_range_ns() will be split for the upcoming support for softirq based hrtimers to avoid code duplication and will lose the direct access to the clock base pointer. Instead of handing in timer and timer->base as a parameter remove the base parameter from hrtimer_reprogram() instead and retrieve the clock base internally. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-23-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f4a56fbae662..33a6c990166d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -648,10 +648,10 @@ static inline void retrigger_next_event(void *arg) { } * * Called with interrupts disabled and base->cpu_base.lock held */ -static void hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static void hrtimer_reprogram(struct hrtimer *timer) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); @@ -939,7 +939,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, if (!leftmost) goto unlock; - hrtimer_reprogram(timer, new_base); + hrtimer_reprogram(timer); unlock: unlock_hrtimer_base(timer, &flags); } -- cgit v1.2.3 From 138a6b7ae4dedde5513678f57b275eee19c41b6a Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:52 +0100 Subject: hrtimer: Factor out __hrtimer_start_range_ns() Preparatory patch for softirq based hrtimers to avoid code duplication, factor out the __hrtimer_start_range_ns() function from hrtimer_start_range_ns(). No functional change. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-24-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 33a6c990166d..4142e6f536b4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -905,22 +905,11 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, return tim; } -/** - * hrtimer_start_range_ns - (re)start an hrtimer - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED) - */ -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 delta_ns, const enum hrtimer_mode mode) +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode, + struct hrtimer_clock_base *base) { - struct hrtimer_clock_base *base, *new_base; - unsigned long flags; - int leftmost; - - base = lock_hrtimer_base(timer, &flags); + struct hrtimer_clock_base *new_base; /* Remove an active timer from the queue: */ remove_hrtimer(timer, base, true); @@ -935,12 +924,27 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - leftmost = enqueue_hrtimer(timer, new_base, mode); - if (!leftmost) - goto unlock; + return enqueue_hrtimer(timer, new_base, mode); +} +/** + * hrtimer_start_range_ns - (re)start an hrtimer + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED) + */ +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + + base = lock_hrtimer_base(timer, &flags); + + if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) + hrtimer_reprogram(timer); - hrtimer_reprogram(timer); -unlock: unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); -- cgit v1.2.3 From ad38f596d8e4babc19be8b21a7a49debffb4a7f5 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:53 +0100 Subject: hrtimer: Factor out __hrtimer_next_event_base() Preparatory patch for softirq based hrtimers to avoid code duplication. No functional change. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-25-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4142e6f536b4..5d9b81d224b3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -458,13 +458,13 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) #define for_each_active_base(base, cpu_base, active) \ while ((base = __next_base((cpu_base), &(active)))) -static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + unsigned int active, + ktime_t expires_next) { struct hrtimer_clock_base *base; - unsigned int active = cpu_base->active_bases; - ktime_t expires, expires_next = KTIME_MAX; + ktime_t expires; - cpu_base->next_timer = NULL; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; @@ -487,6 +487,18 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) return expires_next; } +static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +{ + unsigned int active = cpu_base->active_bases; + ktime_t expires_next = KTIME_MAX; + + cpu_base->next_timer = NULL; + + expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + + return expires_next; +} + static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; -- cgit v1.2.3 From dd934aa8ad1fbaab3d916125c7fe42fff75aa7ff Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:54 +0100 Subject: hrtimer: Use irqsave/irqrestore around __run_hrtimer() __run_hrtimer() is called with the hrtimer_cpu_base.lock held and interrupts disabled. Before invoking the timer callback the base lock is dropped, but interrupts stay disabled. The upcoming support for softirq based hrtimers requires that interrupts are enabled before the timer callback is invoked. To avoid code duplication, take hrtimer_cpu_base.lock with raw_spin_lock_irqsave(flags) at the call site and hand in the flags as a parameter. So raw_spin_unlock_irqrestore() before the callback invocation will either keep interrupts disabled in interrupt context or restore to interrupt enabled state when called from softirq context. Suggested-by: Peter Zijlstra Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-26-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5d9b81d224b3..31ccd86e63c0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1159,7 +1159,8 @@ EXPORT_SYMBOL_GPL(hrtimer_active); static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, - struct hrtimer *timer, ktime_t *now) + struct hrtimer *timer, ktime_t *now, + unsigned long flags) { enum hrtimer_restart (*fn)(struct hrtimer *); int restart; @@ -1194,11 +1195,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, * protected against migration to a different CPU even if the lock * is dropped. */ - raw_spin_unlock(&cpu_base->lock); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); restart = fn(timer); trace_hrtimer_expire_exit(timer); - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and @@ -1226,7 +1227,8 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, base->running = NULL; } -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + unsigned long flags) { struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases; @@ -1257,7 +1259,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (basenow < hrtimer_get_softexpires_tv64(timer)) break; - __run_hrtimer(cpu_base, base, timer, &basenow); + __run_hrtimer(cpu_base, base, timer, &basenow, flags); } } } @@ -1272,13 +1274,14 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; + unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; @@ -1291,7 +1294,7 @@ retry: */ cpu_base->expires_next = KTIME_MAX; - __hrtimer_run_queues(cpu_base, now); + __hrtimer_run_queues(cpu_base, now, flags); /* Reevaluate the clock bases for the next expiry */ expires_next = __hrtimer_get_next_event(cpu_base); @@ -1301,7 +1304,7 @@ retry: */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; - raw_spin_unlock(&cpu_base->lock); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { @@ -1322,7 +1325,7 @@ retry: * Acquire base lock for updating the offsets and retrieving * the current time. */ - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) @@ -1335,7 +1338,8 @@ retry: */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; - raw_spin_unlock(&cpu_base->lock); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; @@ -1377,6 +1381,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { } void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; ktime_t now; if (__hrtimer_hres_active(cpu_base)) @@ -1394,10 +1399,10 @@ void hrtimer_run_queues(void) return; } - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); - __hrtimer_run_queues(cpu_base, now); - raw_spin_unlock(&cpu_base->lock); + __hrtimer_run_queues(cpu_base, now, flags); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* -- cgit v1.2.3 From 98ecadd4305d8677ba77162152485798d47dcc85 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:55 +0100 Subject: hrtimer: Add clock bases and hrtimer mode for softirq context Currently hrtimer callback functions are always executed in hard interrupt context. Users of hrtimers, which need their timer function to be executed in soft interrupt context, make use of tasklets to get the proper context. Add additional hrtimer clock bases for timers which must expire in softirq context, so the detour via the tasklet can be avoided. This is also required for RT, where the majority of hrtimer is moved into softirq hrtimer context. The selection of the expiry mode happens via a mode bit. Introduce HRTIMER_MODE_SOFT and the matching combinations with the ABS/REL/PINNED bits and update the decoding of hrtimer_mode in tracepoints. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-27-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 31ccd86e63c0..e2353f5cdf51 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -92,6 +92,26 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, + { + .index = HRTIMER_BASE_MONOTONIC_SOFT, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + }, + { + .index = HRTIMER_BASE_REALTIME_SOFT, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, + }, + { + .index = HRTIMER_BASE_BOOTTIME_SOFT, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { + .index = HRTIMER_BASE_TAI_SOFT, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + }, } }; -- cgit v1.2.3 From c458b1d102036eaa2c70e03000c959bd491c2037 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:56 +0100 Subject: hrtimer: Prepare handling of hard and softirq based hrtimers The softirq based hrtimer can utilize most of the existing hrtimers functions, but need to operate on a different data set. Add an 'active_mask' parameter to various functions so the hard and soft bases can be selected. Fixup the existing callers and hand in the ACTIVE_HARD mask. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-28-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e2353f5cdf51..ba4674e9adc2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -59,6 +59,15 @@ #include "tick-internal.h" +/* + * Masks for selecting the soft and hard context timers from + * cpu_base->active + */ +#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) +#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) +#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) +#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) + /* * The timer bases: * @@ -507,13 +516,24 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, return expires_next; } -static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +/* + * Recomputes cpu_base::*next_timer and returns the earliest expires_next but + * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram. + * + * @active_mask must be one of: + * - HRTIMER_ACTIVE, + * - HRTIMER_ACTIVE_SOFT, or + * - HRTIMER_ACTIVE_HARD. + */ +static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, + unsigned int active_mask) { - unsigned int active = cpu_base->active_bases; + unsigned int active; ktime_t expires_next = KTIME_MAX; cpu_base->next_timer = NULL; + active = cpu_base->active_bases & active_mask; expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); return expires_next; @@ -553,7 +573,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; - expires_next = __hrtimer_get_next_event(cpu_base); + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); if (skip_equal && expires_next == cpu_base->expires_next) return; @@ -1074,7 +1094,7 @@ u64 hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) - expires = __hrtimer_get_next_event(cpu_base); + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1248,10 +1268,10 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, - unsigned long flags) + unsigned long flags, unsigned int active_mask) { struct hrtimer_clock_base *base; - unsigned int active = cpu_base->active_bases; + unsigned int active = cpu_base->active_bases & active_mask; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; @@ -1314,10 +1334,10 @@ retry: */ cpu_base->expires_next = KTIME_MAX; - __hrtimer_run_queues(cpu_base, now, flags); + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the next expiry */ - expires_next = __hrtimer_get_next_event(cpu_base); + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); /* * Store the new expiry value so the migration code can verify * against it. @@ -1421,7 +1441,7 @@ void hrtimer_run_queues(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); - __hrtimer_run_queues(cpu_base, now, flags); + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } -- cgit v1.2.3 From c96f5471ce7d2aefd0dda560cc23f08ab00bc65d Mon Sep 17 00:00:00 2001 From: Josh Snyder Date: Mon, 18 Dec 2017 16:15:10 +0000 Subject: delayacct: Account blkio completion on the correct task Before commit: e33a9bba85a8 ("sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler") delayacct_blkio_end() was called after context-switching into the task which completed I/O. This resulted in double counting: the task would account a delay both waiting for I/O and for time spent in the runqueue. With e33a9bba85a8, delayacct_blkio_end() is called by try_to_wake_up(). In ttwu, we have not yet context-switched. This is more correct, in that the delay accounting ends when the I/O is complete. But delayacct_blkio_end() relies on 'get_current()', and we have not yet context-switched into the task whose I/O completed. This results in the wrong task having its delay accounting statistics updated. Instead of doing that, pass the task_struct being woken to delayacct_blkio_end(), so that it can update the statistics of the correct task. Signed-off-by: Josh Snyder Acked-by: Tejun Heo Acked-by: Balbir Singh Cc: Cc: Brendan Gregg Cc: Jens Axboe Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-block@vger.kernel.org Fixes: e33a9bba85a8 ("sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler") Link: http://lkml.kernel.org/r/1513613712-571-1-git-send-email-joshs@netflix.com Signed-off-by: Ingo Molnar --- kernel/delayacct.c | 42 ++++++++++++++++++++++++++---------------- kernel/sched/core.c | 6 +++--- 2 files changed, 29 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 4a1c33416b6a..e2764d767f18 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(u64 *start, u64 *total, u32 *count) +static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) { s64 ns = ktime_get_ns() - *start; unsigned long flags; if (ns > 0) { - spin_lock_irqsave(¤t->delays->lock, flags); + spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); + spin_unlock_irqrestore(lock, flags); } } @@ -69,17 +69,25 @@ void __delayacct_blkio_start(void) current->delays->blkio_start = ktime_get_ns(); } -void __delayacct_blkio_end(void) +/* + * We cannot rely on the `current` macro, as we haven't yet switched back to + * the process being woken. + */ +void __delayacct_blkio_end(struct task_struct *p) { - if (current->delays->flags & DELAYACCT_PF_SWAPIN) - /* Swapin block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->swapin_delay, - ¤t->delays->swapin_count); - else /* Other block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_delay, - ¤t->delays->blkio_count); + struct task_delay_info *delays = p->delays; + u64 *total; + u32 *count; + + if (p->delays->flags & DELAYACCT_PF_SWAPIN) { + total = &delays->swapin_delay; + count = &delays->swapin_count; + } else { + total = &delays->blkio_delay; + count = &delays->blkio_count; + } + + delayacct_end(&delays->lock, &delays->blkio_start, total, count); } int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -153,8 +161,10 @@ void __delayacct_freepages_start(void) void __delayacct_freepages_end(void) { - delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_delay, - ¤t->delays->freepages_count); + delayacct_end( + ¤t->delays->lock, + ¤t->delays->freepages_start, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 644fa2e3d993..a7bf32aabfda 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->state = TASK_WAKING; if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #else /* CONFIG_SMP */ if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) if (!task_on_rq_queued(p)) { if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&rq->nr_iowait); } ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); -- cgit v1.2.3 From 5da70160462e80b0ab8a6960cdd0cdd476907523 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:57 +0100 Subject: hrtimer: Implement support for softirq based hrtimers hrtimer callbacks are always invoked in hard interrupt context. Several users in tree require soft interrupt context for their callbacks and achieve this by combining a hrtimer with a tasklet. The hrtimer schedules the tasklet in hard interrupt context and the tasklet callback gets invoked in softirq context later. That's suboptimal and aside of that the real-time patch moves most of the hrtimers into softirq context. So adding native support for hrtimers expiring in softirq context is a valuable extension for both mainline and the RT patch set. Each valid hrtimer clock id has two associated hrtimer clock bases: one for timers expiring in hardirq context and one for timers expiring in softirq context. Implement the functionality to associate a hrtimer with the hard or softirq related clock bases and update the relevant functions to take them into account when the next expiry time needs to be evaluated. Add a check into the hard interrupt context handler functions to check whether the first expiring softirq based timer has expired. If it's expired the softirq is raised and the accounting of softirq based timers to evaluate the next expiry time for programming the timer hardware is skipped until the softirq processing has finished. At the end of the softirq processing the regular processing is resumed. Suggested-by: Thomas Gleixner Suggested-by: Peter Zijlstra Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-29-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 196 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 172 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ba4674e9adc2..d93e3e745592 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -411,7 +411,8 @@ static inline void debug_hrtimer_init(struct hrtimer *timer) debug_object_init(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_activate(struct hrtimer *timer) +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } @@ -444,8 +445,10 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else + static inline void debug_hrtimer_init(struct hrtimer *timer) { } -static inline void debug_hrtimer_activate(struct hrtimer *timer) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif @@ -460,7 +463,7 @@ debug_init(struct hrtimer *timer, clockid_t clockid, static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode) { - debug_hrtimer_activate(timer); + debug_hrtimer_activate(timer, mode); trace_hrtimer_start(timer, mode); } @@ -503,7 +506,10 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; - cpu_base->next_timer = timer; + if (timer->is_soft) + cpu_base->softirq_next_timer = timer; + else + cpu_base->next_timer = timer; } } /* @@ -520,21 +526,39 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, * Recomputes cpu_base::*next_timer and returns the earliest expires_next but * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram. * + * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, + * those timers will get run whenever the softirq gets handled, at the end of + * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. + * + * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. + * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual + * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. + * * @active_mask must be one of: - * - HRTIMER_ACTIVE, + * - HRTIMER_ACTIVE_ALL, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ -static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, - unsigned int active_mask) +static ktime_t +__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { unsigned int active; + struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; - cpu_base->next_timer = NULL; + if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + cpu_base->softirq_next_timer = NULL; + expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); + + next_timer = cpu_base->softirq_next_timer; + } - active = cpu_base->active_bases & active_mask; - expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + if (active_mask & HRTIMER_ACTIVE_HARD) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + cpu_base->next_timer = next_timer; + expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + } return expires_next; } @@ -545,8 +569,14 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets_now(&base->clock_was_set_seq, + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); + + base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; + base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; + + return now; } /* @@ -573,7 +603,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; - expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); + /* + * Find the current next expiration time. + */ + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); + + if (cpu_base->next_timer && cpu_base->next_timer->is_soft) { + /* + * When the softirq is activated, hrtimer has to be + * programmed with the first hard hrtimer because soft + * timer interrupt could occur too late. + */ + if (cpu_base->softirq_activated) + expires_next = __hrtimer_get_next_event(cpu_base, + HRTIMER_ACTIVE_HARD); + else + cpu_base->softirq_expires_next = expires_next; + } if (skip_equal && expires_next == cpu_base->expires_next) return; @@ -700,7 +746,7 @@ static inline void retrigger_next_event(void *arg) { } * * Called with interrupts disabled and base->cpu_base.lock held */ -static void hrtimer_reprogram(struct hrtimer *timer) +static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; @@ -708,6 +754,37 @@ static void hrtimer_reprogram(struct hrtimer *timer) WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Set it to 0. + */ + if (expires < 0) + expires = 0; + + if (timer->is_soft) { + /* + * soft hrtimer could be started on a remote CPU. In this + * case softirq_expires_next needs to be updated on the + * remote CPU. The soft hrtimer will not expire before the + * first hard hrtimer on the remote CPU - + * hrtimer_check_target() prevents this case. + */ + struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; + + if (timer_cpu_base->softirq_activated) + return; + + if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) + return; + + timer_cpu_base->softirq_next_timer = timer; + timer_cpu_base->softirq_expires_next = expires; + + if (!ktime_before(expires, timer_cpu_base->expires_next) || + !reprogram) + return; + } + /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. @@ -725,13 +802,6 @@ static void hrtimer_reprogram(struct hrtimer *timer) if (cpu_base->in_hrtirq) return; - /* - * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Set it to 0. - */ - if (expires < 0) - expires = 0; - if (expires >= cpu_base->expires_next) return; @@ -957,6 +1027,31 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, return tim; } +static void +hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) +{ + ktime_t expires; + + /* + * Find the next SOFT expiration. + */ + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + + /* + * reprogramming needs to be triggered, even if the next soft + * hrtimer expires at the same time than the next hard + * hrtimer. cpu_base->softirq_expires_next needs to be updated! + */ + if (expires == KTIME_MAX) + return; + + /* + * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() + * cpu_base->*expires_next is only set by hrtimer_reprogram() + */ + hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); +} + static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) @@ -978,13 +1073,15 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, return enqueue_hrtimer(timer, new_base, mode); } + /** * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED) + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); + * softirq based mode is considered for debug purpose only! */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) @@ -992,10 +1089,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base; unsigned long flags; + /* + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft + * match. + */ + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + base = lock_hrtimer_base(timer, &flags); if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) - hrtimer_reprogram(timer); + hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags); } @@ -1094,7 +1197,7 @@ u64 hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) - expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1304,6 +1407,23 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, } } +static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; + ktime_t now; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + now = hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); + + cpu_base->softirq_activated = 0; + hrtimer_update_softirq_timer(cpu_base, true); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); +} + #ifdef CONFIG_HIGH_RES_TIMERS /* @@ -1334,10 +1454,16 @@ retry: */ cpu_base->expires_next = KTIME_MAX; + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the next expiry */ - expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); /* * Store the new expiry value so the migration code can verify * against it. @@ -1441,6 +1567,13 @@ void hrtimer_run_queues(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); + + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } @@ -1622,6 +1755,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->cpu = cpu; cpu_base->hres_active = 0; cpu_base->expires_next = KTIME_MAX; + cpu_base->softirq_expires_next = KTIME_MAX; return 0; } @@ -1665,6 +1799,12 @@ int hrtimers_dead_cpu(unsigned int scpu) BUG_ON(cpu_online(scpu)); tick_cancel_sched_timer(scpu); + /* + * this BH disable ensures that raise_softirq_irqoff() does + * not wakeup ksoftirqd (and acquire the pi-lock) while + * holding the cpu_base lock + */ + local_bh_disable(); local_irq_disable(); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); @@ -1680,12 +1820,19 @@ int hrtimers_dead_cpu(unsigned int scpu) &new_base->clock_base[i]); } + /* + * The migration might have changed the first expiring softirq + * timer on this CPU. Update it. + */ + hrtimer_update_softirq_timer(new_base, false); + raw_spin_unlock(&old_base->lock); raw_spin_unlock(&new_base->lock); /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); local_irq_enable(); + local_bh_enable(); return 0; } @@ -1694,6 +1841,7 @@ int hrtimers_dead_cpu(unsigned int scpu) void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); + open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } /** -- cgit v1.2.3 From 42f42da41b54c191ae6a775e84a86c100d66c5e8 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 21 Dec 2017 11:41:58 +0100 Subject: hrtimer: Implement SOFT/HARD clock base selection All prerequisites to handle hrtimers for expiry in either hard or soft interrupt context are in place. Add the missing bit in hrtimer_init() which associates the timer to the hard or the softirq clock base. Signed-off-by: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: keescook@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-30-anna-maria@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d93e3e745592..3d201582630d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1220,8 +1220,9 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { + bool softtimer = !!(mode & HRTIMER_MODE_SOFT); + int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; struct hrtimer_cpu_base *cpu_base; - int base; memset(timer, 0, sizeof(struct hrtimer)); @@ -1235,7 +1236,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; - base = hrtimer_clockid_to_base(clock_id); + base += hrtimer_clockid_to_base(clock_id); + timer->is_soft = softtimer; timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } @@ -1244,8 +1246,13 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used - * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL); pinned is not considered here! + * @mode: The modes which are relevant for intitialization: + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, + * HRTIMER_MODE_REL_SOFT + * + * The PINNED variants of the above can be handed in, + * but the PINNED bit is ignored as pinning happens + * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) -- cgit v1.2.3 From f37a8cb84cce18762e8f86a70bd6a49a66ab964c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 16 Jan 2018 23:30:10 +0100 Subject: bpf: reject stores into ctx via st and xadd Alexei found that verifier does not reject stores into context via BPF_ST instead of BPF_STX. And while looking at it, we also should not allow XADD variant of BPF_STX. The context rewriter is only assuming either BPF_LDX_MEM- or BPF_STX_MEM-type operations, thus reject anything other than that so that assumptions in the rewriter properly hold. Add test cases as well for BPF selftests. Fixes: d691f9e8d440 ("bpf: allow programs to write to certain skb fields") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b7448347e6b6..eb062b0fbf27 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } +static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = cur_regs(env) + regno; + + return reg->type == PTR_TO_CTX; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1258,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return -EACCES; } + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_XADD stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); @@ -3993,6 +4006,12 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_ST stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -- cgit v1.2.3 From e1e871aff3ded26348c631b1370e257d401cd22d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Jan 2018 15:12:01 +0000 Subject: Expand INIT_STRUCT_PID and remove Expand INIT_STRUCT_PID in the single place that uses it and then remove it. There doesn't seem any point in the macro. Signed-off-by: David Howells Tested-by: Tony Luck Tested-by: Will Deacon (arm64) Tested-by: Palmer Dabbelt Acked-by: Thomas Gleixner --- kernel/pid.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index b13b624e2c49..161af2eda943 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,7 +41,19 @@ #include #include -struct pid init_struct_pid = INIT_STRUCT_PID; +struct pid init_struct_pid = { + .count = ATOMIC_INIT(1), + .tasks = { + { .first = NULL }, + { .first = NULL }, + { .first = NULL }, + }, + .level = 0, + .numbers = { { + .nr = 0, + .ns = &init_pid_ns, + }, } +}; int pid_max = PID_MAX_DEFAULT; -- cgit v1.2.3 From 8ffdfe35b8a67e509dee5719807b7f88ed2cda7e Mon Sep 17 00:00:00 2001 From: Kyungsik Lee Date: Tue, 16 Jan 2018 10:19:43 +0900 Subject: PM / hibernate: Drop unused parameter of enough_swap Parameter flags is no longer used, remove it. Signed-off-by: Kyungsik Lee Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 293ead59eccc..a46be1261c09 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -879,7 +879,7 @@ out_clean: * space avaiable from the resume partition. */ -static int enough_swap(unsigned int nr_pages, unsigned int flags) +static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); unsigned int required; @@ -915,7 +915,7 @@ int swsusp_write(unsigned int flags) return error; } if (flags & SF_NOCOMPRESS_MODE) { - if (!enough_swap(pages, flags)) { + if (!enough_swap(pages)) { pr_err("Not enough free swap\n"); error = -ENOSPC; goto out_finish; -- cgit v1.2.3 From 6f16101e6a8b4324c36e58a29d9e0dbb287cdedb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 18 Jan 2018 01:15:21 +0100 Subject: bpf: mark dst unknown on inconsistent {s, u}bounds adjustments syzkaller generated a BPF proglet and triggered a warning with the following: 0: (b7) r0 = 0 1: (d5) if r0 s<= 0x0 goto pc+0 R0=inv0 R1=ctx(id=0,off=0,imm=0) R10=fp0 2: (1f) r0 -= r1 R0=inv0 R1=ctx(id=0,off=0,imm=0) R10=fp0 verifier internal error: known but bad sbounds What happens is that in the first insn, r0's min/max value are both 0 due to the immediate assignment, later in the jsle test the bounds are updated for the min value in the false path, meaning, they yield smin_val = 1, smax_val = 0, and when ctx pointer is subtracted from r0, verifier bails out with the internal error and throwing a WARN since smin_val != smax_val for the known constant. For min_val > max_val scenario it means that reg_set_min_max() and reg_set_min_max_inv() (which both refine existing bounds) demonstrated that such branch cannot be taken at runtime. In above scenario for the case where it will be taken, the existing [0, 0] bounds are kept intact. Meaning, the rejection is not due to a verifier internal error, and therefore the WARN() is not necessary either. We could just reject such cases in adjust_{ptr,scalar}_min_max_vals() when either known scalars have smin_val != smax_val or umin_val != umax_val or any scalar reg with bounds smin_val > smax_val or umin_val > umax_val. However, there may be a small risk of breakage of buggy programs, so handle this more gracefully and in adjust_{ptr,scalar}_min_max_vals() just taint the dst reg as unknown scalar when we see ops with such kind of src reg. Reported-by: syzbot+6d362cadd45dc0a12ba4@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb062b0fbf27..13551e623501 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1895,17 +1895,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; - if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, env->cur_state); - verbose(env, - "verifier internal error: known but bad sbounds\n"); - return -EINVAL; - } - if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, env->cur_state); - verbose(env, - "verifier internal error: known but bad ubounds\n"); - return -EINVAL; + if ((known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; } if (BPF_CLASS(insn->code) != BPF_ALU64) { @@ -2097,6 +2093,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); + if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; + } + if (!src_known && opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { __mark_reg_unknown(dst_reg); -- cgit v1.2.3 From a0c9259dc4e1923a98356967ce8b732da1979df8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Jan 2018 16:01:47 +0100 Subject: irq/matrix: Spread interrupts on allocation Keith reported an issue with vector space exhaustion on a server machine which is caused by the i40e driver allocating 168 MSI interrupts when the driver is initialized, even when most of these interrupts are not used at all. The x86 vector allocation code tries to avoid the immediate allocation with the reservation mode, but the card uses MSI and does not support MSI entry masking, which prevents reservation mode and requires immediate vector allocation. The matrix allocator is a bit naive and prefers the first CPU in the cpumask which describes the possible target CPUs for an allocation. That results in allocating all 168 vectors on CPU0 which later causes vector space exhaustion when the NVMe driver tries to allocate managed interrupts on each CPU for the per CPU queues. Avoid this by finding the CPU which has the lowest vector allocation count to spread out the non managed interrupt accross the possible target CPUs. Fixes: 2f75d9e1c905 ("genirq: Implement bitmap matrix allocator") Reported-by: Keith Busch Signed-off-by: Thomas Gleixner Tested-by: Keith Busch Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801171557330.1777@nanos --- kernel/irq/matrix.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 0ba0dd8863a7..5187dfe809ac 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m) int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu) { - unsigned int cpu; + unsigned int cpu, best_cpu, maxavl = 0; + struct cpumap *cm; + unsigned int bit; + best_cpu = UINT_MAX; for_each_cpu(cpu, msk) { - struct cpumap *cm = per_cpu_ptr(m->maps, cpu); - unsigned int bit; + cm = per_cpu_ptr(m->maps, cpu); - if (!cm->online) + if (!cm->online || cm->available <= maxavl) continue; + best_cpu = cpu; + maxavl = cm->available; + } + + if (maxavl) { + cm = per_cpu_ptr(m->maps, best_cpu); bit = matrix_alloc_area(m, cm, 1, false); if (bit < m->alloc_end) { cm->allocated++; @@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, m->global_available--; if (reserved) m->global_reserved--; - *mapped_cpu = cpu; - trace_irq_matrix_alloc(bit, cpu, m, cm); + *mapped_cpu = best_cpu; + trace_irq_matrix_alloc(bit, best_cpu, m, cm); return bit; } } -- cgit v1.2.3 From 64f29d1bc9fb8196df3d0f1df694245230e208c0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 17 Jan 2018 07:14:12 -0800 Subject: lockdep: Assign lock keys on registration Lockdep is assigning lock keys when a lock was looked up. This is unnecessary; if the lock has never been registered then it is known that it is not locked. It also complicates the calling convention. Switch to assigning the lock key in register_lock_class(). Signed-off-by: Matthew Wilcox Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: "David S. Miller" Link: https://lkml.kernel.org/r/20180117151414.23686-2-willy@infradead.org --- kernel/locking/lockdep.c | 76 +++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5fa1324a4f29..472547dd45c3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -647,18 +647,12 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; - bool is_static = false; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); @@ -671,24 +665,11 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } /* - * Static locks do not have their class-keys yet - for them the key - * is the lock object itself. If the lock is in the per cpu area, - * the canonical address of the lock (per cpu offset removed) is - * used. + * If it is not initialised then it has never been locked, + * so it won't be present in the hash table. */ - if (unlikely(!lock->key)) { - unsigned long can_addr, addr = (unsigned long)lock; - - if (__is_kernel_percpu_address(addr, &can_addr)) - lock->key = (void *)can_addr; - else if (__is_module_percpu_address(addr, &can_addr)) - lock->key = (void *)can_addr; - else if (static_obj(lock)) - lock->key = (void *)lock; - else - return ERR_PTR(-EINVAL); - is_static = true; - } + if (unlikely(!lock->key)) + return NULL; /* * NOTE: the class-key must be unique. For dynamic locks, a static @@ -720,7 +701,35 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } } - return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); + return NULL; +} + +/* + * Static locks do not have their class-keys yet - for them the key is + * the lock object itself. If the lock is in the per cpu area, the + * canonical address of the lock (per cpu offset removed) is used. + */ +static bool assign_lock_key(struct lockdep_map *lock) +{ + unsigned long can_addr, addr = (unsigned long)lock; + + if (__is_kernel_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (__is_module_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (static_obj(lock)) + lock->key = (void *)lock; + else { + /* Debug-check: all keys must be persistent! */ + debug_locks_off(); + pr_err("INFO: trying to register non-static key.\n"); + pr_err("the code is fine but needs lockdep annotation.\n"); + pr_err("turning off the locking correctness validator.\n"); + dump_stack(); + return false; + } + + return true; } /* @@ -738,18 +747,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); - if (likely(!IS_ERR_OR_NULL(class))) + if (likely(class)) goto out_set_class_cache; - /* - * Debug-check: all keys must be persistent! - */ - if (IS_ERR(class)) { - debug_locks_off(); - printk("INFO: trying to register non-static key.\n"); - printk("the code is fine but needs lockdep annotation.\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); + if (!lock->key) { + if (!assign_lock_key(lock)) + return NULL; + } else if (!static_obj(lock->key)) { return NULL; } @@ -3498,7 +3502,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) * Clearly if the lock hasn't been acquired _ever_, we're not * holding it either, so report failure. */ - if (IS_ERR_OR_NULL(class)) + if (!class) return 0; /* @@ -4294,7 +4298,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) * If the class exists we look it up and zap it: */ class = look_up_lock_class(lock, j); - if (!IS_ERR_OR_NULL(class)) + if (class) zap_class(class); } /* -- cgit v1.2.3 From 08f36ff642342fb058212099757cb5d40f158c2a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 17 Jan 2018 07:14:13 -0800 Subject: lockdep: Make lockdep checking constant There are several places in the kernel which would like to pass a const pointer to lockdep_is_held(). Constify the entire path so nobody has to trick the compiler. Signed-off-by: Matthew Wilcox Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: "David S. Miller" Link: https://lkml.kernel.org/r/20180117151414.23686-3-willy@infradead.org --- kernel/locking/lockdep.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 472547dd45c3..b7a307b53704 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -648,7 +648,7 @@ static int count_matching_names(struct lock_class *new_class) } static inline struct lock_class * -look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) +look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; struct hlist_head *hash_head; @@ -3276,7 +3276,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; } -static int __lock_is_held(struct lockdep_map *lock, int read); +static int __lock_is_held(const struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. @@ -3485,13 +3485,14 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; } -static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) +static int match_held_lock(const struct held_lock *hlock, + const struct lockdep_map *lock) { if (hlock->instance == lock) return 1; if (hlock->references) { - struct lock_class *class = lock->class_cache[0]; + const struct lock_class *class = lock->class_cache[0]; if (!class) class = look_up_lock_class(lock, 0); @@ -3727,7 +3728,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) return 1; } -static int __lock_is_held(struct lockdep_map *lock, int read) +static int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3941,7 +3942,7 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held_type(struct lockdep_map *lock, int read) +int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; -- cgit v1.2.3 From 0164e0d7e803af3ee1c63770978c728f8778ad01 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 18 Jan 2018 15:42:09 -0500 Subject: ring-buffer: Fix duplicate results in mapping context to bits in recursive lock In bringing back the context checks, the code checks first if its normal (non-interrupt) context, and then for NMI then IRQ then softirq. The final check is redundant. Since the if branch is only hit if the context is one of NMI, IRQ, or SOFTIRQ, if it's not NMI or IRQ there's no reason to check if it is SOFTIRQ. The current code returns the same result even if its not a SOFTIRQ. Which is confusing. pc & SOFTIRQ_OFFSET ? 2 : RB_CTX_SOFTIRQ Is redundant as RB_CTX_SOFTIRQ *is* 2! Fixes: a0e3a18f4baf ("ring-buffer: Bring back context level recursive checks") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0cddf60186da..5af2842dea96 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2579,8 +2579,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = RB_CTX_NORMAL; else bit = pc & NMI_MASK ? RB_CTX_NMI : - pc & HARDIRQ_MASK ? RB_CTX_IRQ : - pc & SOFTIRQ_OFFSET ? 2 : RB_CTX_SOFTIRQ; + pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; if (unlikely(val & (1 << bit))) return 1; -- cgit v1.2.3 From 1ebe1eaf2f02784921759992ae1fde1a9bec8fd0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 18 Jan 2018 15:53:10 -0500 Subject: tracing: Fix converting enum's from the map in trace_event_eval_update() Since enums do not get converted by the TRACE_EVENT macro into their values, the event format displaces the enum name and not the value. This breaks tools like perf and trace-cmd that need to interpret the raw binary data. To solve this, an enum map was created to convert these enums into their actual numbers on boot up. This is done by TRACE_EVENTS() adding a TRACE_DEFINE_ENUM() macro. Some enums were not being converted. This was caused by an optization that had a bug in it. All calls get checked against this enum map to see if it should be converted or not, and it compares the call's system to the system that the enum map was created under. If they match, then they call is processed. To cut down on the number of iterations needed to find the maps with a matching system, since calls and maps are grouped by system, when a match is made, the index into the map array is saved, so that the next call, if it belongs to the same system as the previous call, could start right at that array index and not have to scan all the previous arrays. The problem was, the saved index was used as the variable to know if this is a call in a new system or not. If the index was zero, it was assumed that the call is in a new system and would keep incrementing the saved index until it found a matching system. The issue arises when the first matching system was at index zero. The next map, if it belonged to the same system, would then think it was the first match and increment the index to one. If the next call belong to the same system, it would begin its search of the maps off by one, and miss the first enum that should be converted. This left a single enum not converted properly. Also add a comment to describe exactly what that index was for. It took me a bit too long to figure out what I was thinking when debugging this issue. Link: http://lkml.kernel.org/r/717BE572-2070-4C1E-9902-9F2E0FEDA4F8@oracle.com Cc: stable@vger.kernel.org Fixes: 0c564a538aa93 ("tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values") Reported-by: Chuck Lever Teste-by: Chuck Lever Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ec0f9aa4e151..1b87157edbff 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; + bool first = false; int last_i; int i; @@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { + first = true; last_i = 0; last_system = call->class->system; } + /* + * Since calls are grouped by systems, the likelyhood that the + * next call in the iteration belongs to the same system as the + * previous call is high. As an optimization, we skip seaching + * for a map[] that matches the call's system if the last call + * was from the same system. That's what last_i is for. If the + * call has the same system as the previous call, then last_i + * will be the index of the first map[] that has a matching + * system. + */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ - if (!last_i) + if (first) { last_i = i; + first = false; + } update_event_printk(call, map[i]); } } -- cgit v1.2.3 From 6be7fa3c74d1e0cd50f2157b5c1524f152bf641e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 22 Jan 2018 22:32:51 -0500 Subject: ftrace, orc, x86: Handle ftrace dynamically allocated trampolines The function tracer can create a dynamically allocated trampoline that is called by the function mcount or fentry hook that is used to call the function callback that is registered. The problem is that the orc undwinder will bail if it encounters one of these trampolines. This breaks the stack trace of function callbacks, which include the stack tracer and setting the stack trace for individual functions. Since these dynamic trampolines are basically copies of the static ftrace trampolines defined in ftrace_*.S, we do not need to create new orc entries for the dynamic trampolines. Finding the return address on the stack will be identical as the functions that were copied to create the dynamic trampolines. When encountering a ftrace dynamic trampoline, we can just use the orc entry of the ftrace static function that was copied for that trampoline. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccdf3664e4a9..554b517c61a0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = { }; /* - * This is used by __kernel_text_address() to return true if the - * address is on a dynamically allocated trampoline that would - * not return true for either core_kernel_text() or - * is_module_text_address(). + * Used by the stack undwinder to know about dynamic ftrace trampolines. */ -bool is_ftrace_trampoline(unsigned long addr) +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { - struct ftrace_ops *op; - bool ret = false; + struct ftrace_ops *op = NULL; /* * Some of the ops may be dynamically allocated, @@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr) if (op->trampoline && op->trampoline_size) if (addr >= op->trampoline && addr < op->trampoline + op->trampoline_size) { - ret = true; - goto out; + preempt_enable_notrace(); + return op; } } while_for_each_ftrace_op(op); - - out: preempt_enable_notrace(); - return ret; + return NULL; +} + +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + return ftrace_ops_trampoline(addr) != NULL; } struct ftrace_page { -- cgit v1.2.3 From 2ee5b92a2598d9e403337185fdf88f661dee8616 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 23 Jan 2018 13:25:04 -0500 Subject: tracing: Update stack trace skipping for ORC unwinder With the addition of ORC unwinder and FRAME POINTER unwinder, the stack trace skipping requirements have changed. I went through the tracing stack trace dumps with ORC and with frame pointers and recalculated the proper values. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 34 ++++++++++++++----------- kernel/trace/trace_events_trigger.c | 13 ++++++++-- kernel/trace/trace_functions.c | 49 +++++++++++++++++++++++++++---------- 3 files changed, 67 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a8d8a294345..8e3f20a18a06 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); +/* + * Skip 3: + * + * trace_buffer_unlock_commit_regs() + * trace_event_buffer_commit() + * trace_event_raw_event_xxx() +*/ +# define STACK_SKIP 3 + void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, __buffer_unlock_commit(buffer, event); /* - * If regs is not set, then skip the following callers: - * trace_buffer_unlock_commit_regs - * event_trigger_unlock_commit - * trace_event_buffer_commit - * trace_event_raw_event_sched_switch + * If regs is not set, then skip the necessary functions. * Note, we can still get here via blktrace, wakeup tracer * and mmiotrace, but that's ok if they lose a function or - * two. They are that meaningful. + * two. They are not that meaningful. */ - ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } @@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; /* - * Add two, for this function and the call to save_stack_trace() + * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ +#ifndef CONFIG_UNWINDER_ORC if (!regs) - trace.skip += 2; + trace.skip++; +#endif /* * Since events can happen in NMIs there's no safe way to @@ -2711,11 +2718,10 @@ void trace_dump_stack(int skip) local_save_flags(flags); - /* - * Skip 3 more, seems to get us at the caller of - * this function. - */ - skip += 3; +#ifndef CONFIG_UNWINDER_ORC + /* Skip 1 to skip this function. */ + skip++; +#endif __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, skip, preempt_count(), NULL); } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f2ac9d44f6c4..87411482a46f 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_STACKTRACE +#ifdef CONFIG_UNWINDER_ORC +/* Skip 2: + * event_triggers_post_call() + * trace_event_raw_event_xxx() + */ +# define STACK_SKIP 2 +#else /* - * Skip 3: + * Skip 4: * stacktrace_trigger() * event_triggers_post_call() + * trace_event_buffer_commit() * trace_event_raw_event_xxx() */ -#define STACK_SKIP 3 +#define STACK_SKIP 4 +#endif static void stacktrace_trigger(struct event_trigger_data *data, void *rec) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 27f7ad12c4b1..b611cd36e22d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); } +#ifdef CONFIG_UNWINDER_ORC +/* + * Skip 2: + * + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 2 +#else +/* + * Skip 3: + * __trace_stack() + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 3 +#endif + static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) @@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, if (likely(disabled == 1)) { pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); - /* - * skip over 5 funcs: - * __ftrace_trace_stack, - * __trace_stack, - * function_stack_trace_call - * ftrace_list_func - * ftrace_call - */ - __trace_stack(tr, flags, 5, pc); + __trace_stack(tr, flags, STACK_SKIP, pc); } atomic_dec(&data->disabled); @@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, tracer_tracing_off(tr); } +#ifdef CONFIG_UNWINDER_ORC /* - * Skip 4: + * Skip 3: + * + * function_trace_probe_call() + * ftrace_ops_assist_func() + * ftrace_call() + */ +#define FTRACE_STACK_SKIP 3 +#else +/* + * Skip 5: + * + * __trace_stack() * ftrace_stacktrace() * function_trace_probe_call() - * ftrace_ops_list_func() + * ftrace_ops_assist_func() * ftrace_call() */ -#define STACK_SKIP 4 +#define FTRACE_STACK_SKIP 5 +#endif static __always_inline void trace_stack(struct trace_array *tr) { @@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr) local_save_flags(flags); pc = preempt_count(); - __trace_stack(tr, flags, STACK_SKIP, pc); + __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); } static void -- cgit v1.2.3 From a97cb0e7b3f4c6297fd857055ae8e895f402f501 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jan 2018 11:39:47 +0100 Subject: futex: Fix OWNER_DEAD fixup Both Geert and DaveJ reported that the recent futex commit: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") introduced a problem with setting OWNER_DEAD. We set the bit on an uninitialized variable and then entirely optimize it away as a dead-store. Move the setting of the bit to where it is more useful. Reported-by: Geert Uytterhoeven Reported-by: Dave Jones Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") Link: http://lkml.kernel.org/r/20180122103947.GD2228@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 8c5424dd5924..7f719d110908 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2311,9 +2311,6 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); oldowner = pi_state->owner; - /* Owner died? */ - if (!pi_state->owner) - newtid |= FUTEX_OWNER_DIED; /* * We are here because either: @@ -2374,6 +2371,9 @@ retry: } newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; -- cgit v1.2.3 From 88f1c87de11a86d839f4ce5313e552d96709b990 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jan 2018 14:00:55 -0800 Subject: locking/lockdep: Avoid triggering hardlockup from debug_show_all_locks() debug_show_all_locks() iterates all tasks and print held locks whole holding tasklist_lock. This can take a while on a slow console device and may end up triggering NMI hardlockup detector if someone else ends up waiting for tasklist_lock. Touch the NMI watchdog while printing the held locks to avoid spuriously triggering the hardlockup detector. Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20180122220055.GB1771050@devbig577.frc2.facebook.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5fa1324a4f29..521659044719 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -4490,6 +4491,7 @@ retry: if (!unlock) if (read_trylock(&tasklist_lock)) unlock = 1; + touch_nmi_watchdog(); } while_each_thread(g, p); pr_warn("\n"); -- cgit v1.2.3 From ce48c146495a1a50e48cdbfbfaba3e708be7c07c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jan 2018 22:53:28 +0100 Subject: sched/core: Fix cpu.max vs. cpuhotplug deadlock Tejun reported the following cpu-hotplug lock (percpu-rwsem) read recursion: tg_set_cfs_bandwidth() get_online_cpus() cpus_read_lock() cfs_bandwidth_usage_inc() static_key_slow_inc() cpus_read_lock() Reported-by: Tejun Heo Tested-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180122215328.GP3397@worktop Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 12 +++++++++--- kernel/sched/fair.c | 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 8594d24e4adc..b4517095db6a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,7 +79,7 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -static void static_key_slow_inc_cpuslocked(struct static_key *key) +void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; @@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void static_key_slow_dec_cpuslocked(struct static_key *key, +static void __static_key_slow_dec_cpuslocked(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { @@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key, struct delayed_work *work) { cpus_read_lock(); - static_key_slow_dec_cpuslocked(key, rate_limit, work); + __static_key_slow_dec_cpuslocked(key, rate_limit, work); cpus_read_unlock(); } @@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_dec); +void static_key_slow_dec_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(key); + __static_key_slow_dec_cpuslocked(key, 0, NULL); +} + void static_key_slow_dec_deferred(struct static_key_deferred *key) { STATIC_KEY_CHECK_USE(key); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2fe3aa853e4d..26a71ebcd3c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4365,12 +4365,12 @@ static inline bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) { - static_key_slow_inc(&__cfs_bandwidth_used); + static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); } void cfs_bandwidth_usage_dec(void) { - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) -- cgit v1.2.3 From c5baa1be8f559d5f33c412d00cc1c86762a8bbbf Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 17 Jan 2018 14:26:47 +0000 Subject: irqdomain: Kill CONFIG_IRQ_DOMAIN_DEBUG CONFIG_IRQ_DOMAIN_DEBUG is similar to CONFIG_GENERIC_IRQ_DEBUGFS, just with less information. Spring cleanup time. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Yang Shunyong Link: https://lkml.kernel.org/r/20180117142647.23622-1-marc.zyngier@arm.com --- kernel/irq/Kconfig | 10 ----- kernel/irq/irqdomain.c | 118 ------------------------------------------------- 2 files changed, 128 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 89e355866450..6fc87ccda1d7 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -103,16 +103,6 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR config GENERIC_IRQ_RESERVATION_MODE bool -config IRQ_DOMAIN_DEBUG - bool "Expose hardware/virtual IRQ mapping via debugfs" - depends on IRQ_DOMAIN && DEBUG_FS - help - This option will show the mapping relationship between hardware irq - numbers and Linux irq numbers. The mapping is exposed via debugfs - in the file "irq_domain_mapping". - - If you don't know what this means you don't need it. - # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 62068ad46930..e6a9c36470ee 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -897,124 +897,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_find_mapping); -#ifdef CONFIG_IRQ_DOMAIN_DEBUG -static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) -{ - struct irq_domain *domain; - struct irq_data *data; - - domain = desc->irq_data.domain; - data = &desc->irq_data; - - while (domain) { - unsigned int irq = data->irq; - unsigned long hwirq = data->hwirq; - struct irq_chip *chip; - bool direct; - - if (data == &desc->irq_data) - seq_printf(m, "%5d ", irq); - else - seq_printf(m, "%5d+ ", irq); - seq_printf(m, "0x%05lx ", hwirq); - - chip = irq_data_get_irq_chip(data); - seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); - - seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data)); - - seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); - direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); - seq_printf(m, "%6s%-8s ", - (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", - direct ? "(DIRECT)" : ""); - seq_printf(m, "%s\n", domain->name); -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - domain = domain->parent; - data = data->parent_data; -#else - domain = NULL; -#endif - } -} - -static int virq_debug_show(struct seq_file *m, void *private) -{ - unsigned long flags; - struct irq_desc *desc; - struct irq_domain *domain; - struct radix_tree_iter iter; - void __rcu **slot; - int i; - - seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", - "name", "mapped", "linear-max", "direct-max", "devtree-node"); - mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, link) { - struct device_node *of_node; - const char *name; - - int count = 0; - - of_node = irq_domain_get_of_node(domain); - if (of_node) - name = of_node_full_name(of_node); - else if (is_fwnode_irqchip(domain->fwnode)) - name = container_of(domain->fwnode, struct irqchip_fwid, - fwnode)->name; - else - name = ""; - - radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) - count++; - seq_printf(m, "%c%-16s %6u %10u %10u %s\n", - domain == irq_default_domain ? '*' : ' ', domain->name, - domain->revmap_size + count, domain->revmap_size, - domain->revmap_direct_max_irq, - name); - } - mutex_unlock(&irq_domain_mutex); - - seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq", - "chip name", (int)(2 * sizeof(void *) + 2), "chip data", - "active", "type", "domain"); - - for (i = 1; i < nr_irqs; i++) { - desc = irq_to_desc(i); - if (!desc) - continue; - - raw_spin_lock_irqsave(&desc->lock, flags); - virq_debug_show_one(m, desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - } - - return 0; -} - -static int virq_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, virq_debug_show, inode->i_private); -} - -static const struct file_operations virq_debug_fops = { - .open = virq_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init irq_debugfs_init(void) -{ - if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL, - NULL, &virq_debug_fops) == NULL) - return -ENOMEM; - - return 0; -} -__initcall(irq_debugfs_init); -#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ - /** * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings * -- cgit v1.2.3 From 82d94856fa221b5173eefd56bcd1057c037e9b07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Jan 2018 13:10:30 +0100 Subject: perf/core: Fix lock inversion between perf,trace,cpuhp Lockdep gifted us with noticing the following 4-way lockup scenario: perf_trace_init() #0 mutex_lock(&event_mutex) perf_trace_event_init() perf_trace_event_reg() tp_event->class->reg() := tracepoint_probe_register #1 mutex_lock(&tracepoints_mutex) trace_point_add_func() #2 static_key_enable() #2 do_cpu_up() perf_event_init_cpu() #3 mutex_lock(&pmus_lock) #4 mutex_lock(&ctx->mutex) perf_event_task_disable() mutex_lock(¤t->perf_event_mutex) #4 ctx = perf_event_ctx_lock() #5 perf_event_for_each_child() do_exit() task_work_run() __fput() perf_release() perf_event_release_kernel() #4 mutex_lock(&ctx->mutex) #5 mutex_lock(&event->child_mutex) free_event() _free_event() event->destroy() := perf_trace_destroy #0 mutex_lock(&event_mutex); Fix that by moving the free_event() out from under the locks. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4df5b695bf0d..2d80824298a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem + * + * cpu_hotplug_lock + * pmus_lock + * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) @@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; + LIST_HEAD(free_list); /* * If we got here through err_file: fput(event_file); we will not have @@ -4268,8 +4273,7 @@ again: struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); - list_del(&child->child_list); - free_event(child); + list_move(&child->child_list, &free_list); /* * This matches the refcount bump in inherit_event(); * this can't be the last reference. @@ -4284,6 +4288,11 @@ again: } mutex_unlock(&event->child_mutex); + list_for_each_entry_safe(child, tmp, &free_list, child_list) { + list_del(&child->child_list); + free_event(child); + } + no_ctx: put_event(event); /* Must be the 'last' reference */ return 0; -- cgit v1.2.3 From 43fa87f7deed52e8c8420182e0c133bc4cf395f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Jan 2018 17:07:59 +0100 Subject: perf/core: Fix another perf,trace,cpuhp lock inversion Lockdep noticed the following 3-way lockup race: perf_trace_init() #0 mutex_lock(&event_mutex) perf_trace_event_init() perf_trace_event_reg() tp_event->class->reg() := tracepoint_probe_register #1 mutex_lock(&tracepoints_mutex) trace_point_add_func() #2 static_key_enable() #2 do_cpu_up() perf_event_init_cpu() #3 mutex_lock(&pmus_lock) #4 mutex_lock(&ctx->mutex) perf_ioctl() #4 ctx = perf_event_ctx_lock() _perf_iotcl() ftrace_profile_set_filter() #0 mutex_lock(&event_mutex) Fudge it for now by noting that the tracepoint state does not depend on the event <-> context relation. Ugly though :/ Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d80824298a7..816f83d70fc6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8525,6 +8525,29 @@ fail_clear_files: return ret; } +static int +perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) +{ + struct perf_event_context *ctx = event->ctx; + int ret; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint + * stuff does not actually need it. So temporarily drop ctx->mutex. As per + * perf_event_ctx_lock() we already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, but that + * does not affect the tracepoint state. + */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + + return ret; +} + static int perf_event_set_filter(struct perf_event *event, void __user *arg) { char *filter_str; @@ -8541,8 +8564,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg) if (IS_ENABLED(CONFIG_EVENT_TRACING) && event->attr.type == PERF_TYPE_TRACEPOINT) - ret = ftrace_profile_set_filter(event, event->attr.config, - filter_str); + ret = perf_tracepoint_set_filter(event, filter_str); else if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); -- cgit v1.2.3 From 0c7296cad651a3a40286d70ff37e73bd6fa4e4da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Jan 2018 21:23:02 +0100 Subject: perf/core: Fix ctx::mutex deadlock Lockdep noticed the following 3-way lockup scenario: sys_perf_event_open() perf_event_alloc() perf_try_init_event() #0 ctx = perf_event_ctx_lock_nested(1) perf_swevent_init() swevent_hlist_get() #1 mutex_lock(&pmus_lock) perf_event_init_cpu() #1 mutex_lock(&pmus_lock) #2 mutex_lock(&ctx->mutex) sys_perf_event_open() mutex_lock_double() #2 mutex_lock() #0 mutex_lock_nested() And while we need that perf_event_ctx_lock_nested() for HW PMUs such that they can iterate the sibling list, trying to match it to the available counters, the software PMUs need do no such thing. Exclude them. In particular the swevent triggers the above invertion, while the tpevent PMU triggers a more elaborate one through their event_mutex. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 816f83d70fc6..5d8f4031f8d5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9199,7 +9199,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (!try_module_get(pmu->module)) return -ENODEV; - if (event->group_leader != event) { + /* + * A number of pmu->event_init() methods iterate the sibling_list to, + * for example, validate if the group fits on the PMU. Therefore, + * if this is a sibling event, acquire the ctx->mutex to protect + * the sibling_list. + */ + if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. -- cgit v1.2.3 From caf7501a1b4ec964190f31f9c3f163de252273b8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 25 Jan 2018 15:50:28 -0800 Subject: module/retpoline: Warn about missing retpoline in module There's a risk that a kernel which has full retpoline mitigations becomes vulnerable when a module gets loaded that hasn't been compiled with the right compiler or the right option. To enable detection of that mismatch at module load time, add a module info string "retpoline" at build time when the module was compiled with retpoline support. This only covers compiled C source, but assembler source or prebuilt object files are not checked. If a retpoline enabled kernel detects a non retpoline protected module at load time, print a warning and report it in the sysfs vulnerability file. [ tglx: Massaged changelog ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: jeyu@kernel.org Cc: arjan@linux.intel.com Link: https://lkml.kernel.org/r/20180125235028.31211-1-andi@firstfloor.org --- kernel/module.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..690c0651c40f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2855,6 +2855,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) } #endif /* CONFIG_LIVEPATCH */ +static void check_modinfo_retpoline(struct module *mod, struct load_info *info) +{ + if (retpoline_module_ok(get_modinfo(info, "retpoline"))) + return; + + pr_warn("%s: loading module not compiled with retpoline compiler.\n", + mod->name); +} + /* Sets info->hdr and info->len. */ static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) @@ -3021,6 +3030,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); } + check_modinfo_retpoline(mod, info); + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); pr_warn("%s: module is from the staging directory, the quality " -- cgit v1.2.3 From d5421ea43d30701e03cadc56a38854c36a8b4433 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jan 2018 14:54:32 +0100 Subject: hrtimer: Reset hrtimer cpu base proper on CPU hotplug The hrtimer interrupt code contains a hang detection and mitigation mechanism, which prevents that a long delayed hrtimer interrupt causes a continous retriggering of interrupts which prevent the system from making progress. If a hang is detected then the timer hardware is programmed with a certain delay into the future and a flag is set in the hrtimer cpu base which prevents newly enqueued timers from reprogramming the timer hardware prior to the chosen delay. The subsequent hrtimer interrupt after the delay clears the flag and resumes normal operation. If such a hang happens in the last hrtimer interrupt before a CPU is unplugged then the hang_detected flag is set and stays that way when the CPU is plugged in again. At that point the timer hardware is not armed and it cannot be armed because the hang_detected flag is still active, so nothing clears that flag. As a consequence the CPU does not receive hrtimer interrupts and no timers expire on that CPU which results in RCU stalls and other malfunctions. Clear the flag along with some other less critical members of the hrtimer cpu base to ensure starting from a clean state when a CPU is plugged in. Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the root cause of that hard to reproduce heisenbug. Once understood it's trivial and certainly justifies a brown paperbag. Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Sewior Cc: Anna-Maria Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos --- kernel/time/hrtimer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d32520840fde..aa9d2a2b1210 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer, static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next = KTIME_MAX; + base->hang_detected = 0; base->hres_active = 0; + base->next_timer = NULL; } /* @@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->active_bases = 0; cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); return 0; -- cgit v1.2.3