From c9dccacfccc72c32692eedff4a27a4b0833a2afd Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 11 Jul 2019 16:29:37 +0200 Subject: printk: Do not lose last line in kmsg buffer dump kmsg_dump_get_buffer() is supposed to select all the youngest log messages which fit into the provided buffer. It determines the correct start index by using msg_print_text() with a NULL buffer to calculate the size of each entry. However, when performing the actual writes, msg_print_text() only writes the entry to the buffer if the written len is lesser than the size of the buffer. So if the lengths of the selected youngest log messages happen to precisely fill up the provided buffer, the last log message is not included. We don't want to modify msg_print_text() to fill up the buffer and start returning a length which is equal to the size of the buffer, since callers of its other users, such as kmsg_dump_get_line(), depend upon the current behaviour. Instead, fix kmsg_dump_get_buffer() to compensate for this. For example, with the following two final prints: [ 6.427502] AAAAAAAAAAAAA [ 6.427769] BBBBBBBB12345 A dump of a 64-byte buffer filled by kmsg_dump_get_buffer(), before this patch: 00000000: 3c 30 3e 5b 20 20 20 20 36 2e 35 32 32 31 39 37 <0>[ 6.522197 00000010: 5d 20 41 41 41 41 41 41 41 41 41 41 41 41 41 0a ] AAAAAAAAAAAAA. 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ After this patch: 00000000: 3c 30 3e 5b 20 20 20 20 36 2e 34 35 36 36 37 38 <0>[ 6.456678 00000010: 5d 20 42 42 42 42 42 42 42 42 31 32 33 34 35 0a ] BBBBBBBB12345. 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Link: http://lkml.kernel.org/r/20190711142937.4083-1-vincent.whitchurch@axis.com Fixes: e2ae715d66bf4bec ("kmsg - kmsg_dump() use iterator to receive log buffer content") To: rostedt@goodmis.org Cc: linux-kernel@vger.kernel.org Cc: # v3.5+ Signed-off-by: Vincent Whitchurch Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1888f6a3b694..424abf802f02 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3274,7 +3274,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - while (l > size && seq < dumper->next_seq) { + while (l >= size && seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); l -= msg_print_text(msg, true, time, NULL, 0); -- cgit v1.2.3 From f240652b6032b48ad7fa35c5e701cc4c8d697c0b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 5 Jul 2019 10:53:21 -0700 Subject: x86/mpx: Remove MPX APIs MPX is being removed from the kernel due to a lack of support in the toolchain going forward (gcc). The first step is to remove the userspace-visible ABIs so that applications will stop using it. The most visible one are the enable/disable prctl()s. Remove them first. This is the most minimal and least invasive change needed to ensure that apps stop using MPX with new kernels. 
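As a quick illustration of the new ABI behaviour, here is a minimal userspace sketch (an assumption of this example is that the PR_MPX_* constants are still present in the uapi headers, which this patch does not touch):

	#include <errno.h>
	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>

	int main(void)
	{
		/* On a kernel with this change, both MPX prctls are expected to
		 * fail with EINVAL regardless of the extra arguments. */
		if (prctl(PR_MPX_ENABLE_MANAGEMENT, 0, 0, 0, 0) == -1 && errno == EINVAL)
			printf("PR_MPX_ENABLE_MANAGEMENT: no longer implemented\n");
		if (prctl(PR_MPX_DISABLE_MANAGEMENT, 0, 0, 0, 0) == -1 && errno == EINVAL)
			printf("PR_MPX_DISABLE_MANAGEMENT: no longer implemented\n");
		return 0;
	}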
Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190705175321.DB42F0AD@viggo.jf.intel.com --- kernel/sys.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 2969304c29fe..384b000b7865 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -103,12 +103,6 @@ #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif -#ifndef MPX_ENABLE_MANAGEMENT -# define MPX_ENABLE_MANAGEMENT() (-EINVAL) -#endif -#ifndef MPX_DISABLE_MANAGEMENT -# define MPX_DISABLE_MANAGEMENT() (-EINVAL) -#endif #ifndef GET_FP_MODE # define GET_FP_MODE(a) (-EINVAL) #endif @@ -2456,15 +2450,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = MPX_ENABLE_MANAGEMENT(); - break; case PR_MPX_DISABLE_MANAGEMENT: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = MPX_DISABLE_MANAGEMENT(); - break; + /* No longer implemented: */ + return -EINVAL; case PR_SET_FP_MODE: error = SET_FP_MODE(me, arg2); break; -- cgit v1.2.3 From 3a79bc63d90750f737ab9d7219bd3091d2fd6d84 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Jul 2019 13:03:20 +0200 Subject: PCI: irq: Introduce rearm_wake_irq() Introduce a new function, rearm_wake_irq(), allowing a wakeup IRQ to be armed for systen wakeup detection again without running any action handlers associated with it after it has been armed for wakeup detection and triggered. That is useful for IRQs, like ACPI SCI, that may deliver wakeup as well as non-wakeup interrupts when armed for systen wakeup detection. In those cases, it may be possible to determine whether or not the delivered interrupt is a systen wakeup one without running the entire action handler (or handlers, if the IRQ is shared) for the IRQ, and if the interrupt turns out to be a non-wakeup one, the IRQ can be rearmed with the help of the new function. Signed-off-by: Rafael J. Wysocki Acked-by: Thomas Gleixner --- kernel/irq/pm.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d6961d3c6f9e..8f557fa1f4fe 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -176,6 +176,26 @@ static void resume_irqs(bool want_early) } } +/** + * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup + * @irq: Interrupt to rearm + */ +void rearm_wake_irq(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + + if (!desc || !(desc->istate & IRQS_SUSPENDED) || + !irqd_is_wakeup_set(&desc->irq_data)) + return; + + desc->istate &= ~IRQS_SUSPENDED; + irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + __enable_irq(desc); + + irq_put_desc_busunlock(desc, flags); +} + /** * irq_pm_syscore_ops - enable interrupt lines early * -- cgit v1.2.3 From 56b991849009f5def0443bfb2f48c8321d888e15 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Jul 2019 23:52:03 +0200 Subject: PM: sleep: Simplify suspend-to-idle control flow After commit 33e4f80ee69b ("ACPI / PM: Ignore spurious SCI wakeups from suspend-to-idle") the "noirq" phases of device suspend and resume may run for multiple times during suspend-to-idle, if there are spurious system wakeup events while suspended. However, this is complicated and fragile and actually unnecessary. 
The main reason for doing this is that on some systems the EC may signal system wakeup events (power button events, for example) as well as events that should not cause the system to resume (spurious system wakeup events). Thus, in order to determine whether or not a given event signaled by the EC while suspended is a proper system wakeup one, the EC GPE needs to be dispatched and to start with that was achieved by allowing the ACPI SCI action handler to run, which was only possible after calling resume_device_irqs(). However, dispatching the EC GPE this way turned out to take too much time in some cases and some EC events might be missed due to that, so commit 68e22011856f ("ACPI: EC: Dispatch the EC GPE directly on s2idle wake") started to dispatch the EC GPE right after a wakeup event has been detected, so in fact the full ACPI SCI action handler doesn't need to run any more to deal with the wakeups coming from the EC. Use this observation to simplify the suspend-to-idle control flow so that the "noirq" phases of device suspend and resume are each run only once in every suspend-to-idle cycle, which is reported to significantly reduce power drawn by some systems when suspended to idle (by allowing them to reach a deep platform-wide low-power state through the suspend-to-idle flow). [What appears to happen is that the "noirq" resume of devices after a spurious EC wakeup brings some devices into a state in which they prevent the platform from reaching the deep low-power state going forward, even after a subsequent "noirq" suspend phase, and on some systems the EC triggers such wakeups already when the "noirq" suspend of devices is running for the first time in the given suspend/resume cycle, so the platform cannot reach the deep low-power state at all.] First, make acpi_s2idle_wake() use the acpi_ec_dispatch_gpe() return value to determine whether or not the wakeup may have been triggered by the EC (in which case the system wakeup is canceled and ACPI events are processed in order to determine whether or not the event is a proper system wakeup one) and use rearm_wake_irq() (introduced by a previous change) in it to rearm the ACPI SCI for system wakeup detection in case the system will remain suspended. Second, drop acpi_s2idle_sync(), which is not needed any more, and the corresponding global platform suspend-to-idle callback. Next, drop the pm_wakeup_pending() check (which is an optimization only) from __device_suspend_noirq() to prevent it from returning errors on system wakeups occurring before the "noirq" phase of device suspend is complete (as in the case of suspend-to-idle it is not known whether or not these wakeups are spurious at that point), in order to avoid having to carry out a "noirq" resume of devices on a spurious system wakeup. Finally, change the code flow in s2idle_loop() to (1) run the "noirq" suspend of devices once before starting the loop, (2) check for spurious EC wakeups (via the platform ->wake callback) for the first time before calling s2idle_enter(), and (3) run the "noirq" resume of devices once after leaving the loop. Signed-off-by: Rafael J.
Wysocki Acked-by: Thomas Gleixner --- kernel/power/suspend.c | 53 ++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c874a7026e24..907b2be0372f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -119,48 +119,41 @@ static void s2idle_enter(void) static void s2idle_loop(void) { + int error; + + dpm_noirq_begin(); + error = dpm_noirq_suspend_devices(PMSG_SUSPEND); + if (error) + goto resume; + pm_pr_dbg("suspend-to-idle\n"); + /* + * Suspend-to-idle equals: + * frozen processes + suspended devices + idle processors. + * Thus s2idle_enter() should be called right after all devices have + * been suspended. + * + * Wakeups during the noirq suspend of devices may be spurious, so try + * to avoid them upfront. + */ for (;;) { - int error; - - dpm_noirq_begin(); - - /* - * Suspend-to-idle equals - * frozen processes + suspended devices + idle processors. - * Thus s2idle_enter() should be called right after - * all devices have been suspended. - * - * Wakeups during the noirq suspend of devices may be spurious, - * so prevent them from terminating the loop right away. - */ - error = dpm_noirq_suspend_devices(PMSG_SUSPEND); - if (!error) - s2idle_enter(); - else if (error == -EBUSY && pm_wakeup_pending()) - error = 0; - - if (!error && s2idle_ops && s2idle_ops->wake) + if (s2idle_ops && s2idle_ops->wake) s2idle_ops->wake(); - dpm_noirq_resume_devices(PMSG_RESUME); - - dpm_noirq_end(); - - if (error) - break; - - if (s2idle_ops && s2idle_ops->sync) - s2idle_ops->sync(); - if (pm_wakeup_pending()) break; pm_wakeup_clear(false); + + s2idle_enter(); } pm_pr_dbg("resume from suspend-to-idle\n"); + +resume: + dpm_noirq_resume_devices(PMSG_RESUME); + dpm_noirq_end(); } void s2idle_wake(void) -- cgit v1.2.3 From 8eb0fd3b55f084320ae511cd5a64d356cf497c83 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Jul 2019 23:52:12 +0200 Subject: PM: sleep: Integrate suspend-to-idle with generig suspend flow After previous changes the suspend-to-idle code flow can be integrated more tightly with the generic system suspend code flow by making suspend_enter() call s2idle_loop() later and removing the direct invocations of dpm_noirq_begin(), dpm_noirq_suspend_devices(), dpm_noirq_end(), and dpm_noirq_resume_devices() from the latter, so do that. This change is not expected to alter functionality. Signed-off-by: Rafael J. 
Wysocki Acked-by: Thomas Gleixner --- kernel/power/suspend.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 907b2be0372f..2b6057853b33 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -119,13 +119,6 @@ static void s2idle_enter(void) static void s2idle_loop(void) { - int error; - - dpm_noirq_begin(); - error = dpm_noirq_suspend_devices(PMSG_SUSPEND); - if (error) - goto resume; - pm_pr_dbg("suspend-to-idle\n"); /* @@ -150,10 +143,6 @@ static void s2idle_loop(void) } pm_pr_dbg("resume from suspend-to-idle\n"); - -resume: - dpm_noirq_resume_devices(PMSG_RESUME); - dpm_noirq_end(); } void s2idle_wake(void) @@ -408,11 +397,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (error) goto Devices_early_resume; - if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) { - s2idle_loop(); - goto Platform_early_resume; - } - error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { pr_err("noirq suspend of devices failed\n"); @@ -425,6 +409,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_test(TEST_PLATFORM)) goto Platform_wake; + if (state == PM_SUSPEND_TO_IDLE) { + s2idle_loop(); + goto Platform_wake; + } + error = suspend_disable_secondary_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; -- cgit v1.2.3 From 85db0023376f529c477c6110043e069ccee16d9c Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 2 Jul 2019 19:26:59 +0200 Subject: cgroup: Replace a seq_printf() call by seq_puts() in cgroup_print_ss_mask() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A string which did not contain a data format specification should be put into a sequence. Thus use the corresponding function “seq_puts”. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 753afbca549f..3c3d92d993e8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2894,7 +2894,7 @@ static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); - seq_printf(seq, "%s", ss->name); + seq_puts(seq, ss->name); printed = true; } while_each_subsys_mask(); if (printed) -- cgit v1.2.3 From a581563f1bef035e4c8d634a1df26dae9140b115 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Wed, 3 Jul 2019 10:07:49 +0800 Subject: cgroup: minor tweak for logic to get cgroup css We could only handle the case that css exists and css_try_get_online() fails. 
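Put differently, the NULL case never needed the explicit reset; a short sketch of the equivalence (illustration only, not taken from the patch):

	/*
	 * old: if (!css || !css_tryget_online(css))	css = NULL;
	 * new: if (css && !css_tryget_online(css))	css = NULL;
	 *
	 * When cgroup_css() has returned NULL, css is already NULL and assigning
	 * NULL again changes nothing, so the only case that really needs handling
	 * is "css exists but css_tryget_online() fails".
	 */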
Signed-off-by: Peng Wang Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3c3d92d993e8..21ddf3053235 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -488,7 +488,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, rcu_read_lock(); css = cgroup_css(cgrp, ss); - if (!css || !css_tryget_online(css)) + if (css && !css_tryget_online(css)) css = NULL; rcu_read_unlock(); -- cgit v1.2.3 From 364f6afc4f5537b79cf454eb35cae92920676075 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:40 -0700 Subject: locking/lockdep: Make it clear that what lock_class::key points at is not modified This patch does not change the behavior of the lockdep code. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-2-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- kernel/locking/lockdep_internals.h | 3 ++- kernel/locking/lockdep_proc.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4861cf8e274b..af6627866191 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -511,7 +511,7 @@ static const char *usage_str[] = }; #endif -const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +const char *__get_key_name(const struct lockdep_subclass_key *key, char *str) { return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index cc83568d5012..2e518369add4 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -116,7 +116,8 @@ extern struct lock_chain lock_chains[]; extern void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]); -extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); +extern const char *__get_key_name(const struct lockdep_subclass_key *key, + char *str); struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index bda006f8a88b..ed9842425cac 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -399,7 +399,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) static void seq_stats(struct seq_file *m, struct lock_stat_data *data) { - struct lockdep_subclass_key *ckey; + const struct lockdep_subclass_key *ckey; struct lock_class_stats *stats; struct lock_class *class; const char *cname; -- cgit v1.2.3 From a2970421640bd9b6a78f2685d7750a791abdfd4e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:41 -0700 Subject: stacktrace: Constify 'entries' arguments Make it clear to humans and to the compiler that the stack trace ('entries') arguments are not modified. 
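A hypothetical caller, only to illustrate what the constification buys (kernel-context sketch assuming <linux/stacktrace.h>; not part of the patch):

	/* A caller that only holds a read-only view of a saved trace can now
	 * pass it straight through without casting away the const qualifier. */
	static void print_saved_trace(const unsigned long *entries, unsigned int nr)
	{
		stack_trace_print(entries, nr, 4);	/* print with 4 leading spaces */
	}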
Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-3-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/stacktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f5440abb7532..6d1f68b7e528 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -20,7 +20,7 @@ * @nr_entries: Number of entries in the storage array * @spaces: Number of leading spaces to print */ -void stack_trace_print(unsigned long *entries, unsigned int nr_entries, +void stack_trace_print(const unsigned long *entries, unsigned int nr_entries, int spaces) { unsigned int i; @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(stack_trace_print); * * Return: Number of bytes printed. */ -int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, +int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries, unsigned int nr_entries, int spaces) { unsigned int generated, i, total = 0; -- cgit v1.2.3 From 12593b7467f9130b64a6d4b6a26ed4ec217b6784 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:42 -0700 Subject: locking/lockdep: Reduce space occupied by stack traces Although commit 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues") unregisters dynamic lockdep keys when a workqueue is destroyed, a side effect of that commit is that all stack traces associated with the lockdep key are leaked when a workqueue is destroyed. Fix this by storing each unique stack trace once. Other changes in this patch are: - Use NULL instead of { .nr_entries = 0 } to represent 'no trace'. - Store a pointer to a stack trace in struct lock_class and struct lock_list instead of storing 'nr_entries' and 'offset'. This patch avoids that the following program triggers the "BUG: MAX_STACK_TRACE_ENTRIES too low!" complaint: #include #include int main() { for (;;) { int fd = open("/dev/infiniband/rdma_cm", O_RDWR); close(fd); } } Suggested-by: Peter Zijlstra Reported-by: Eric Biggers Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: Yuyang Du Link: https://lkml.kernel.org/r/20190722182443.216015-4-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 128 ++++++++++++++++++++++++++----------- kernel/locking/lockdep_internals.h | 2 + 2 files changed, 92 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index af6627866191..1a96869cb2f0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -449,33 +449,72 @@ static void print_lockdep_off(const char *bug_msg) unsigned long nr_stack_trace_entries; #ifdef CONFIG_PROVE_LOCKING +/** + * struct lock_trace - single stack backtrace + * @hash_entry: Entry in a stack_trace_hash[] list. + * @hash: jhash() of @entries. + * @nr_entries: Number of entries in @entries. + * @entries: Actual stack backtrace. + */ +struct lock_trace { + struct hlist_node hash_entry; + u32 hash; + u32 nr_entries; + unsigned long entries[0] __aligned(sizeof(unsigned long)); +}; +#define LOCK_TRACE_SIZE_IN_LONGS \ + (sizeof(struct lock_trace) / sizeof(unsigned long)) /* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. + * Stack-trace: sequence of lock_trace structures. 
Protected by the graph_lock. */ static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE]; + +static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2) +{ + return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries && + memcmp(t1->entries, t2->entries, + t1->nr_entries * sizeof(t1->entries[0])) == 0; +} -static int save_trace(struct lock_trace *trace) +static struct lock_trace *save_trace(void) { - unsigned long *entries = stack_trace + nr_stack_trace_entries; + struct lock_trace *trace, *t2; + struct hlist_head *hash_head; + u32 hash; unsigned int max_entries; - trace->offset = nr_stack_trace_entries; - max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->nr_entries = stack_trace_save(entries, max_entries, 3); - nr_stack_trace_entries += trace->nr_entries; + BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); + BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); + + trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - + LOCK_TRACE_SIZE_IN_LONGS; + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - + LOCK_TRACE_SIZE_IN_LONGS - 1) { if (!debug_locks_off_graph_unlock()) - return 0; + return NULL; print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); - return 0; + return NULL; } - return 1; + hash = jhash(trace->entries, trace->nr_entries * + sizeof(trace->entries[0]), 0); + trace->hash = hash; + hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1)); + hlist_for_each_entry(t2, hash_head, hash_entry) { + if (traces_identical(trace, t2)) + return t2; + } + nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries; + hlist_add_head(&trace->hash_entry, hash_head); + + return trace; } #endif @@ -1235,7 +1274,7 @@ static struct lock_list *alloc_list_entry(void) static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, - struct lock_trace *trace) + const struct lock_trace *trace) { struct lock_list *entry; /* @@ -1249,7 +1288,7 @@ static int add_lock_to_list(struct lock_class *this, entry->class = this; entry->links_to = links_to; entry->distance = distance; - entry->trace = *trace; + entry->trace = trace; /* * Both allocation and removal are done under the graph lock; but * iteration is under RCU-sched; see look_up_lock_class() and @@ -1470,11 +1509,10 @@ static inline int __bfs_backwards(struct lock_list *src_entry, } -static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +static void print_lock_trace(const struct lock_trace *trace, + unsigned int spaces) { - unsigned long *entries = stack_trace + trace->offset; - - stack_trace_print(entries, trace->nr_entries, spaces); + stack_trace_print(trace->entries, trace->nr_entries, spaces); } /* @@ -1489,7 +1527,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); - print_lock_trace(&target->trace, 6); + print_lock_trace(target->trace, 6); } static void @@ -1592,7 +1630,8 @@ static noinline void print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; - if (!save_trace(&this->trace)) + this->trace = save_trace(); + if 
(!this->trace) return; depth = get_lock_depth(target); @@ -1715,7 +1754,7 @@ check_path(struct lock_class *target, struct lock_list *src_entry, */ static noinline int check_noncircular(struct held_lock *src, struct held_lock *target, - struct lock_trace *trace) + struct lock_trace **const trace) { int ret; struct lock_list *uninitialized_var(target_entry); @@ -1729,13 +1768,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target, ret = check_path(hlock_class(target), &src_entry, &target_entry); if (unlikely(!ret)) { - if (!trace->nr_entries) { + if (!*trace) { /* * If save_trace fails here, the printing might * trigger a WARN but because of the !nr_entries it * should not do bad things. */ - save_trace(trace); + *trace = save_trace(); } print_circular_bug(&src_entry, target_entry, src, target); @@ -1859,7 +1898,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_lock_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces[bit], len); } } printk("%*s }\n", depth, ""); @@ -1884,7 +1923,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... acquired at:\n", depth, ""); - print_lock_trace(&entry->trace, 2); + print_lock_trace(entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1995,14 +2034,14 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock_name(backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); - print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces[bit1], 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); pr_warn("\n... 
which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces[bit2], 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -2011,13 +2050,15 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); - if (!save_trace(&prev_root->trace)) + prev_root->trace = save_trace(); + if (!prev_root->trace) return; print_shortest_lock_dependencies(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); - if (!save_trace(&next_root->trace)) + next_root->trace = save_trace(); + if (!next_root->trace) return; print_shortest_lock_dependencies(forwards_entry, next_root); @@ -2369,7 +2410,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct lock_trace *trace) + struct held_lock *next, int distance, + struct lock_trace **const trace) { struct lock_list *entry; int ret; @@ -2444,8 +2486,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return ret; #endif - if (!trace->nr_entries && !save_trace(trace)) - return 0; + if (!*trace) { + *trace = save_trace(); + if (!*trace) + return 0; + } /* * Ok, all validations passed, add the new lock @@ -2453,14 +2498,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, trace); + next->acquire_ip, distance, *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, - next->acquire_ip, distance, trace); + next->acquire_ip, distance, *trace); if (!ret) return 0; @@ -2476,7 +2521,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { - struct lock_trace trace = { .nr_entries = 0 }; + struct lock_trace *trace = NULL; int depth = curr->lockdep_depth; struct held_lock *hlock; @@ -3015,7 +3060,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); @@ -3096,7 +3141,8 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); - if (!save_trace(&root->trace)) + root->trace = save_trace(); + if (!root->trace) return; print_shortest_lock_dependencies(other, root); @@ -3580,7 +3626,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, hlock_class(this)->usage_mask |= new_mask; - if (!save_trace(hlock_class(this)->usage_traces + new_bit)) + if (!(hlock_class(this)->usage_traces[new_bit] = save_trace())) return 0; switch (new_bit) { @@ -5157,6 +5203,12 @@ void __init lockdep_init(void) ) / 1024 ); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + printk(" memory used for stack traces: %zu 
kB\n", + (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024 + ); +#endif + printk(" per task-struct memory footprint: %zu bytes\n", sizeof(((struct task_struct *)NULL)->held_locks)); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 2e518369add4..93a008bf77db 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -92,6 +92,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_ENTRIES 16384UL #define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_STACK_TRACE_ENTRIES 262144UL +#define STACK_TRACE_HASH_SIZE 8192 #else #define MAX_LOCKDEP_ENTRIES 32768UL @@ -102,6 +103,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = * addresses. Protected by the hash_lock. */ #define MAX_STACK_TRACE_ENTRIES 524288UL +#define STACK_TRACE_HASH_SIZE 16384 #endif #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) -- cgit v1.2.3 From 8c779229d0f4fe83ead90bdcbbf08b02989aa200 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jul 2019 11:24:43 -0700 Subject: locking/lockdep: Report more stack trace statistics Report the number of stack traces and the number of stack trace hash chains. These two numbers are useful because these allow to estimate the number of stack trace hash collisions. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: https://lkml.kernel.org/r/20190722182443.216015-5-bvanassche@acm.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 29 +++++++++++++++++++++++++++++ kernel/locking/lockdep_internals.h | 4 ++++ kernel/locking/lockdep_proc.c | 6 ++++++ 3 files changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1a96869cb2f0..3c3902c40a0e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -516,6 +516,35 @@ static struct lock_trace *save_trace(void) return trace; } + +/* Return the number of stack traces in the stack_trace[] array. */ +u64 lockdep_stack_trace_count(void) +{ + struct lock_trace *trace; + u64 c = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) { + hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) { + c++; + } + } + + return c; +} + +/* Return the number of stack hash chains that have at least one stack trace. 
*/ +u64 lockdep_stack_hash_count(void) +{ + u64 c = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) + if (!hlist_empty(&stack_trace_hash[i])) + c++; + + return c; +} #endif unsigned int nr_hardirq_chains; diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 93a008bf77db..18d85aebbb57 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -140,6 +140,10 @@ extern unsigned int max_bfs_queue_depth; #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#ifdef CONFIG_TRACE_IRQFLAGS +u64 lockdep_stack_trace_count(void); +u64 lockdep_stack_hash_count(void); +#endif #else static inline unsigned long lockdep_count_forward_deps(struct lock_class *class) diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index ed9842425cac..dadb7b7fba37 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -285,6 +285,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_process_chains); seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + seq_printf(m, " number of stack traces: %llu\n", + lockdep_stack_trace_count()); + seq_printf(m, " number of stack hash chains: %llu\n", + lockdep_stack_hash_count()); +#endif seq_printf(m, " combined max dependencies: %11u\n", (nr_hardirq_chains + 1) * (nr_softirq_chains + 1) * -- cgit v1.2.3 From e797bda3fd29137f6c151dfa10ea6a61c17895ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Jul 2019 20:47:16 +0200 Subject: smp/hotplug: Track booted once CPUs in a cpumask The booted once information which is required to deal with the MCE broadcast issue on X86 correctly is stored in the per cpu hotplug state, which is perfectly fine for the intended purpose. X86 needs that information for supporting NMI broadcasting via shortcuts, but retrieving it from per cpu data is cumbersome. Move it to a cpumask so the information can be checked against the cpu_present_mask quickly. No functional change intended. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190722105219.818822855@linutronix.de --- kernel/cpu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index e84c0873559e..05778e32674a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -62,7 +62,6 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; - bool booted_once; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; @@ -76,6 +75,10 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { .fail = CPUHP_INVALID, }; +#ifdef CONFIG_SMP +cpumask_t cpus_booted_once_mask; +#endif + #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); @@ -433,7 +436,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. 
*/ - return !per_cpu(cpuhp_state, cpu).booted_once; + return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } #else static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } @@ -1066,7 +1069,7 @@ void notify_cpu_starting(unsigned int cpu) int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ - st->booted_once = true; + cpumask_set_cpu(cpu, &cpus_booted_once_mask); while (st->state < target) { st->state++; ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); @@ -2334,7 +2337,7 @@ void __init boot_cpu_init(void) void __init boot_cpu_hotplug_init(void) { #ifdef CONFIG_SMP - this_cpu_write(cpuhp_state.booted_once, true); + cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask); #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -- cgit v1.2.3 From 0c09ab96fc820109d63097a2adcbbd20836b655f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Jul 2019 16:23:40 +0200 Subject: cpu/hotplug: Cache number of online CPUs Re-evaluating the bitmap wheight of the online cpus bitmap in every invocation of num_online_cpus() over and over is a pretty useless exercise. Especially when num_online_cpus() is used in code paths like the IPI delivery of x86 or the membarrier code. Cache the number of online CPUs in the core and just return the cached variable. The accessor function provides only a snapshot when used without protection against concurrent CPU hotplug. The storage needs to use an atomic_t because the kexec and reboot code (ab)use set_cpu_online() in their 'shutdown' handlers without any form of serialization as pointed out by Mathieu. Regular CPU hotplug usage is properly serialized. Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907091622590.1634@nanos.tec.linutronix.de --- kernel/cpu.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 05778e32674a..e1967e9eddc2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2298,6 +2298,9 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); +atomic_t __num_online_cpus __read_mostly; +EXPORT_SYMBOL(__num_online_cpus); + void init_cpu_present(const struct cpumask *src) { cpumask_copy(&__cpu_present_mask, src); @@ -2313,6 +2316,27 @@ void init_cpu_online(const struct cpumask *src) cpumask_copy(&__cpu_online_mask, src); } +void set_cpu_online(unsigned int cpu, bool online) +{ + /* + * atomic_inc/dec() is required to handle the horrid abuse of this + * function by the reboot and kexec code which invoke it from + * IPI/NMI broadcasts when shutting down CPUs. Invocation from + * regular CPU hotplug is properly serialized. + * + * Note, that the fact that __num_online_cpus is of type atomic_t + * does not protect readers which are not serialized against + * concurrent hotplug operations. + */ + if (online) { + if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask)) + atomic_inc(&__num_online_cpus); + } else { + if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask)) + atomic_dec(&__num_online_cpus); + } +} + /* * Activate the first processor. */ -- cgit v1.2.3 From d35927a144641700c8328d707d1c89d305b4ecb8 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Jul 2019 11:25:06 +0100 Subject: sched/fair: Move init_numa_balancing() below task_numa_work() To reference task_numa_work() from within init_numa_balancing(), we need the former to be declared before the latter. Do just that. 
This is a pure code movement. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgorman@suse.de Cc: riel@surriel.com Link: https://lkml.kernel.org/r/20190715102508.32434-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 82 ++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc9cfeaac8bd..f0c488015649 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1188,47 +1188,6 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) -{ - int mm_users = 0; - struct mm_struct *mm = p->mm; - - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) { - mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - mm->numa_scan_seq = 0; - } - } - p->node_stamp = 0; - p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - RCU_INIT_POINTER(p->numa_group, NULL); - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; - - /* New address space, reset the preferred nid */ - if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = NUMA_NO_NODE; - return; - } - - /* - * New thread, keep existing numa_preferred_nid which should be copied - * already by arch_dup_task_struct but stagger when scans start. - */ - if (mm) { - unsigned int delay; - - delay = min_t(unsigned int, task_scan_max(current), - current->numa_scan_period * mm_users * NSEC_PER_MSEC); - delay += 2 * TICK_NSEC; - p->node_stamp = delay; - } -} - static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); @@ -2665,6 +2624,47 @@ out: } } +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ + int mm_users = 0; + struct mm_struct *mm = p->mm; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) { + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_scan_seq = 0; + } + } + p->node_stamp = 0; + p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + RCU_INIT_POINTER(p->numa_group, NULL); + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + /* New address space, reset the preferred nid */ + if (!(clone_flags & CLONE_VM)) { + p->numa_preferred_nid = NUMA_NO_NODE; + return; + } + + /* + * New thread, keep existing numa_preferred_nid which should be copied + * already by arch_dup_task_struct but stagger when scans start. + */ + if (mm) { + unsigned int delay; + + delay = min_t(unsigned int, task_scan_max(current), + current->numa_scan_period * mm_users * NSEC_PER_MSEC); + delay += 2 * TICK_NSEC; + p->node_stamp = delay; + } +} + /* * Drive the periodic memory faults.. */ -- cgit v1.2.3 From b34920d4ce6e6fc9424c20a4be98676eb543122f Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Jul 2019 11:25:07 +0100 Subject: sched/fair: Move task_numa_work() init to init_numa_balancing() We only need to set the callback_head worker function once, do it during sched_fork(). 
While at it, move the comment regarding double task_work addition to init_numa_balancing(), since the double add sentinel is first set there. Suggested-by: Peter Zijlstra Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Cc: mgorman@suse.de Cc: riel@surriel.com Link: https://lkml.kernel.org/r/20190715102508.32434-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f0c488015649..fd391fc00ed8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2495,7 +2495,7 @@ void task_numa_work(struct callback_head *work) SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); - work->next = work; /* protect against double add */ + work->next = work; /* * Who cares about NUMA placement when they're dying. * @@ -2639,12 +2639,15 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->node_stamp = 0; p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; + /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; + init_task_work(&p->numa_work, task_numa_work); + /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { p->numa_preferred_nid = NUMA_NO_NODE; @@ -2693,10 +2696,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; - if (!time_before(jiffies, curr->mm->numa_next_scan)) { - init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + if (!time_before(jiffies, curr->mm->numa_next_scan)) task_work_add(curr, work, true); - } } } -- cgit v1.2.3 From 9434f9f5d117302cc7ddf038e7879f6871dc7a81 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 15 Jul 2019 11:25:08 +0100 Subject: sched/fair: Change task_numa_work() storage to static There are no callers outside of fair.c. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgorman@suse.de Cc: riel@surriel.com Link: https://lkml.kernel.org/r/20190715102508.32434-4-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fd391fc00ed8..b5546a15206c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2482,7 +2482,7 @@ static void reset_ptenuma_scan(struct task_struct *p) * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). */ -void task_numa_work(struct callback_head *work) +static void task_numa_work(struct callback_head *work) { unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; -- cgit v1.2.3 From f6cad8df6b30a5d2bbbd2e698f74b4cafb9fb82b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 1 Jul 2019 17:47:02 +0200 Subject: sched/fair: Fix imbalance due to CPU affinity The load_balance() has a dedicated mecanism to detect when an imbalance is due to CPU affinity and must be handled at parent level. In this case, the imbalance field of the parent's sched_group is set. 
The description of sg_imbalanced() gives a typical example of two groups of 4 CPUs each and 4 tasks each with a cpumask covering 1 CPU of the first group and 3 CPUs of the second group. Something like: { 0 1 2 3 } { 4 5 6 7 } * * * * But the load_balance fails to fix this UC on my octo cores system made of 2 clusters of quad cores. Whereas the load_balance is able to detect that the imbalanced is due to CPU affinity, it fails to fix it because the imbalance field is cleared before letting parent level a chance to run. In fact, when the imbalance is detected, the load_balance reruns without the CPU with pinned tasks. But there is no other running tasks in the situation described above and everything looks balanced this time so the imbalance field is immediately cleared. The imbalance field should not be cleared if there is no other task to move when the imbalance is detected. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/1561996022-28829-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b5546a15206c..9be36ffb5689 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9048,9 +9048,10 @@ more_balance: out_balanced: /* * We reach balance although we may have faced some affinity - * constraints. Clear the imbalance flag if it was set. + * constraints. Clear the imbalance flag only if other tasks got + * a chance to move and fix the imbalance. */ - if (sd_parent) { + if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if (*group_imbalance) -- cgit v1.2.3 From 84ec3a0787086fcd25f284f59b3aa01fd6fc0a5d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Jun 2019 09:52:38 -0700 Subject: time/tick-broadcast: Fix tick_broadcast_offline() lockdep complaint time/tick-broadcast: Fix tick_broadcast_offline() lockdep complaint The TASKS03 and TREE04 rcutorture scenarios produce the following lockdep complaint: WARNING: inconsistent lock state 5.2.0-rc1+ #513 Not tainted -------------------------------- inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. migration/1/14 [HC0[0]:SC0[0]:HE1:SE1] takes: (____ptrval____) (tick_broadcast_lock){?...}, at: tick_broadcast_offline+0xf/0x70 {IN-HARDIRQ-W} state was registered at: lock_acquire+0xb0/0x1c0 _raw_spin_lock_irqsave+0x3c/0x50 tick_broadcast_switch_to_oneshot+0xd/0x40 tick_switch_to_oneshot+0x4f/0xd0 hrtimer_run_queues+0xf3/0x130 run_local_timers+0x1c/0x50 update_process_times+0x1c/0x50 tick_periodic+0x26/0xc0 tick_handle_periodic+0x1a/0x60 smp_apic_timer_interrupt+0x80/0x2a0 apic_timer_interrupt+0xf/0x20 _raw_spin_unlock_irqrestore+0x4e/0x60 rcu_nocb_gp_kthread+0x15d/0x590 kthread+0xf3/0x130 ret_from_fork+0x3a/0x50 irq event stamp: 171 hardirqs last enabled at (171): [] trace_hardirqs_on_thunk+0x1a/0x1c hardirqs last disabled at (170): [] trace_hardirqs_off_thunk+0x1a/0x1c softirqs last enabled at (0): [] copy_process.part.56+0x650/0x1cb0 softirqs last disabled at (0): [<0000000000000000>] 0x0 [...] To reproduce, run the following rcutorture test: $ tools/testing/selftests/rcutorture/bin/kvm.sh --duration 5 --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" --configs "TASKS03 TREE04" It turns out that tick_broadcast_offline() was an innocent bystander. 
After all, interrupts are supposed to be disabled throughout take_cpu_down(), and therefore should have been disabled upon entry to tick_offline_cpu() and thus to tick_broadcast_offline(). This suggests that one of the CPU-hotplug notifiers was incorrectly enabling interrupts, and leaving them enabled on return. Some debugging code showed that the culprit was sched_cpu_dying(). It had irqs enabled after return from sched_tick_stop(). Which in turn had irqs enabled after return from cancel_delayed_work_sync(). Which is a wrapper around __cancel_work_timer(). Which can sleep in the case where something else is concurrently trying to cancel the same delayed work, and as Thomas Gleixner pointed out on IRC, sleeping is a decidedly bad idea when you are invoked from take_cpu_down(), regardless of the state you leave interrupts in upon return. Code inspection located no reason why the delayed work absolutely needed to be canceled from sched_tick_stop(): The work is not bound to the outgoing CPU by design, given that the whole point is to collect statistics without disturbing the outgoing CPU. This commit therefore simply drops the cancel_delayed_work_sync() from sched_tick_stop(). Instead, a new ->state field is added to the tick_work structure so that the delayed-work handler function sched_tick_remote() can avoid reposting itself. A cpu_is_offline() check is also added to sched_tick_remote() to avoid mucking with the state of an offlined CPU (though it does appear safe to do so). The sched_tick_start() and sched_tick_stop() functions also update ->state, and sched_tick_start() also schedules the delayed work if ->state indicates that it is not already in flight. Signed-off-by: Paul E. McKenney [ paulmck: Apply Peter Zijlstra and Frederic Weisbecker atomics feedback. ] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190625165238.GJ26519@linux.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 57 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..0b22e55cebe8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3486,8 +3486,36 @@ void scheduler_tick(void) struct tick_work { int cpu; + atomic_t state; struct delayed_work work; }; +/* Values for ->state, see diagram below. */ +#define TICK_SCHED_REMOTE_OFFLINE 0 +#define TICK_SCHED_REMOTE_OFFLINING 1 +#define TICK_SCHED_REMOTE_RUNNING 2 + +/* + * State diagram for ->state: + * + * + * TICK_SCHED_REMOTE_OFFLINE + * | ^ + * | | + * | | sched_tick_remote() + * | | + * | | + * +--TICK_SCHED_REMOTE_OFFLINING + * | ^ + * | | + * sched_tick_start() | | sched_tick_stop() + * | | + * V | + * TICK_SCHED_REMOTE_RUNNING + * + * + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() + * and sched_tick_start() are happy to leave the state in RUNNING. 
+ */ static struct tick_work __percpu *tick_work_cpu; @@ -3500,6 +3528,7 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr; struct rq_flags rf; u64 delta; + int os; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3513,7 +3542,7 @@ static void sched_tick_remote(struct work_struct *work) rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr)) + if (is_idle_task(curr) || cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); @@ -3533,13 +3562,18 @@ out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough - * to keep scheduler internal stats reasonably up to date. + * to keep scheduler internal stats reasonably up to date. But + * first update state to reflect hotplug activity if required. */ - queue_delayed_work(system_unbound_wq, dwork, HZ); + os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); + if (os == TICK_SCHED_REMOTE_RUNNING) + queue_delayed_work(system_unbound_wq, dwork, HZ); } static void sched_tick_start(int cpu) { + int os; struct tick_work *twork; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) @@ -3548,15 +3582,20 @@ static void sched_tick_start(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - twork->cpu = cpu; - INIT_DELAYED_WORK(&twork->work, sched_tick_remote); - queue_delayed_work(system_unbound_wq, &twork->work, HZ); + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); + if (os == TICK_SCHED_REMOTE_OFFLINE) { + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); + } } #ifdef CONFIG_HOTPLUG_CPU static void sched_tick_stop(int cpu) { struct tick_work *twork; + int os; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) return; @@ -3564,7 +3603,10 @@ static void sched_tick_stop(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - cancel_delayed_work_sync(&twork->work); + /* There cannot be competing actions, but don't rely on stop-machine. */ + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); + WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); + /* Don't cancel, as this would mess up the state machine. */ } #endif /* CONFIG_HOTPLUG_CPU */ @@ -3572,7 +3614,6 @@ int __init sched_tick_offload_init(void) { tick_work_cpu = alloc_percpu(struct tick_work); BUG_ON(!tick_work_cpu); - return 0; } -- cgit v1.2.3 From 43e9f7f231e40e4534fc3a735da152911a085c16 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 26 Jun 2019 10:36:29 +0530 Subject: sched/fair: Start tracking SCHED_IDLE tasks count in cfs_rq Track how many tasks are present with SCHED_IDLE policy in each cfs_rq. This will be used by later commits. 
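For a sense of how the new counter is meant to be used (the real consumer, sched_idle_cpu(), only arrives in a later patch in this series; the helper below is purely an illustrative sketch):

	/* A cfs_rq that has runnable tasks, all of them SCHED_IDLE, can now be
	 * recognised by comparing the two hierarchical counters. */
	static inline bool cfs_rq_only_has_idle_tasks(struct cfs_rq *cfs_rq)
	{
		return cfs_rq->h_nr_running &&
		       cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running;
	}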
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: chris.redpath@arm.com Cc: quentin.perret@linaro.org Cc: songliubraving@fb.com Cc: steven.sistare@oracle.com Cc: subhra.mazumdar@oracle.com Cc: tkjos@google.com Link: https://lkml.kernel.org/r/0d3cdc427fc68808ad5bccc40e86ed0bf9da8bb4.1561523542.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 ++++++++++++-- kernel/sched/sched.h | 3 ++- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9be36ffb5689..9ed5ab53872f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4555,7 +4555,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; + long task_delta, idle_task_delta, dequeue = 1; bool empty; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4566,6 +4566,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -4575,6 +4576,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; if (qcfs_rq->load.weight) dequeue = 0; @@ -4614,7 +4616,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; int enqueue = 1; - long task_delta; + long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4634,6 +4636,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) return; task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) enqueue = 0; @@ -4642,6 +4645,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (enqueue) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; if (cfs_rq_throttled(cfs_rq)) break; @@ -5255,6 +5259,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); /* * The code below (indirectly) updates schedutil which looks at @@ -5287,6 +5292,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; flags = ENQUEUE_WAKEUP; } @@ -5294,6 +5300,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; if (cfs_rq_throttled(cfs_rq)) break; @@ -5355,6 +5362,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + int idle_h_nr_running = task_has_idle_policy(p); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -5369,6 +5377,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; 
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -5388,6 +5397,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; if (cfs_rq_throttled(cfs_rq)) break; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 802b1f3405f2..aaca0e743776 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -483,7 +483,8 @@ struct cfs_rq { struct load_weight load; unsigned long runnable_weight; unsigned int nr_running; - unsigned int h_nr_running; + unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int idle_h_nr_running; /* SCHED_IDLE */ u64 exec_clock; u64 min_vruntime; -- cgit v1.2.3 From 3c29e651e16dd3b3179cfb2d055ee9538e37515c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 26 Jun 2019 10:36:30 +0530 Subject: sched/fair: Fall back to sched-idle CPU if idle CPU isn't found We try to find an idle CPU to run the next task, but in case we don't find an idle CPU it is better to pick a CPU which will run the task the soonest, for performance reason. A CPU which isn't idle but has only SCHED_IDLE activity queued on it should be a good target based on this criteria as any normal fair task will most likely preempt the currently running SCHED_IDLE task immediately. In fact, choosing a SCHED_IDLE CPU over a fully idle one shall give better results as it should be able to run the task sooner than an idle CPU (which requires to be woken up from an idle state). This patch updates both fast and slow paths with this optimization. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: chris.redpath@arm.com Cc: quentin.perret@linaro.org Cc: songliubraving@fb.com Cc: steven.sistare@oracle.com Cc: subhra.mazumdar@oracle.com Cc: tkjos@google.com Link: https://lkml.kernel.org/r/eeafa25fdeb6f6edd5b2da716bc8f0ba7708cbcf.1561523542.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9ed5ab53872f..52564e050062 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5431,6 +5431,15 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ +/* CPU only has SCHED_IDLE tasks enqueued */ +static int sched_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + rq->nr_running); +} + static unsigned long cpu_runnable_load(struct rq *rq) { return cfs_rq_runnable_load_avg(&rq->cfs); @@ -5753,7 +5762,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this unsigned int min_exit_latency = UINT_MAX; u64 latest_idle_timestamp = 0; int least_loaded_cpu = this_cpu; - int shallowest_idle_cpu = -1; + int shallowest_idle_cpu = -1, si_cpu = -1; int i; /* Check if we have any choice: */ @@ -5784,7 +5793,12 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } - } else if (shallowest_idle_cpu == -1) { + } else if (shallowest_idle_cpu == -1 && si_cpu == -1) { + if (sched_idle_cpu(i)) { + si_cpu = i; + continue; + } + load = cpu_runnable_load(cpu_rq(i)); if 
(load < min_load) { min_load = load; @@ -5793,7 +5807,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } - return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; + if (shallowest_idle_cpu != -1) + return shallowest_idle_cpu; + if (si_cpu != -1) + return si_cpu; + return least_loaded_cpu; } static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, @@ -5946,7 +5964,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int */ static int select_idle_smt(struct task_struct *p, int target) { - int cpu; + int cpu, si_cpu = -1; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5956,9 +5974,11 @@ static int select_idle_smt(struct task_struct *p, int target) continue; if (available_idle_cpu(cpu)) return cpu; + if (si_cpu == -1 && sched_idle_cpu(cpu)) + si_cpu = cpu; } - return -1; + return si_cpu; } #else /* CONFIG_SCHED_SMT */ @@ -5986,8 +6006,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 avg_cost, avg_idle; u64 time, cost; s64 delta; - int cpu, nr = INT_MAX; int this = smp_processor_id(); + int cpu, nr = INT_MAX, si_cpu = -1; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6015,11 +6035,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { if (!--nr) - return -1; + return si_cpu; if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; if (available_idle_cpu(cpu)) break; + if (si_cpu == -1 && sched_idle_cpu(cpu)) + si_cpu = cpu; } time = cpu_clock(this) - time; @@ -6038,13 +6060,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) struct sched_domain *sd; int i, recent_used_cpu; - if (available_idle_cpu(target)) + if (available_idle_cpu(target) || sched_idle_cpu(target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) + if (prev != target && cpus_share_cache(prev, target) && + (available_idle_cpu(prev) || sched_idle_cpu(prev))) return prev; /* Check a recently used CPU as a potential idle candidate: */ @@ -6052,7 +6075,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - available_idle_cpu(recent_used_cpu) && + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { /* * Replace recent_used_cpu with prev as it is a potential -- cgit v1.2.3 From 7b3c92b85a65c2db1f542265bc98e1f9e3056eba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 4 Jul 2019 15:13:23 -0700 Subject: sched/core: Convert get_task_struct() to return the task Returning the pointer that was passed in allows us to write slightly more idiomatic code. Convert a few users. 
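As a rough illustration (not part of the patch itself; the helper's definition lives outside the kernel/ directory this listing is limited to), the converted helper shape and the call-site idiom it enables look approximately like this, assuming the usual refcount_t usage counter:

/* Sketch only: the helper takes its reference and hands the same pointer back. */
static inline struct task_struct *get_task_struct(struct task_struct *t)
{
        refcount_inc(&t->usage);
        return t;
}

/* Before: take the reference, then store the pointer. */
get_task_struct(task);
ctx->task = task;

/* After: one statement does both. */
ctx->task = get_task_struct(task);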
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190704221323.24290-1-willy@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 +++------ kernel/irq/manage.c | 3 +-- kernel/locking/rtmutex.c | 6 ++---- kernel/trace/trace_sched_wakeup.c | 3 +-- 4 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 026a14541a38..ea5e8139fe62 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4089,10 +4089,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task) return NULL; __perf_event_init_context(ctx); - if (task) { - ctx->task = task; - get_task_struct(task); - } + if (task) + ctx->task = get_task_struct(task); ctx->pmu = pmu; return ctx; @@ -10355,8 +10353,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ - get_task_struct(task); - event->hw.target = task; + event->hw.target = get_task_struct(task); } event->clock = &local_clock; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e8f7f179bf77..9d50fbe5531a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1255,8 +1255,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) * the thread dies to avoid that the interrupt code * references an already freed task_struct. */ - get_task_struct(t); - new->thread = t; + new->thread = get_task_struct(t); /* * Tell the thread to set its affinity. This is * important for shared interrupt handlers as we do diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fa83d36e30c6..2874bf556162 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -628,8 +628,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* [10] Grab the next task, i.e. owner of @lock */ - task = rt_mutex_owner(lock); - get_task_struct(task); + task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); /* @@ -709,8 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* [10] Grab the next task, i.e. the owner of @lock */ - task = rt_mutex_owner(lock); - get_task_struct(task); + task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 743b2b520d34..5e43b9664eca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -579,8 +579,7 @@ probe_wakeup(void *ignore, struct task_struct *p) else tracing_dl = 0; - wakeup_task = p; - get_task_struct(wakeup_task); + wakeup_task = get_task_struct(p); local_save_flags(flags); -- cgit v1.2.3 From 65d74e91694e1afac40c96fb64a9ef120757729e Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 5 Jul 2019 12:35:07 +0800 Subject: sched/stats: Fix unlikely() use of sched_info_on() sched_info_on() is called with unlikely hint, however, the test is to be a constant(1) on which compiler will do nothing when make defconfig, so remove the hint. Also, fix a lack of {}. 
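For context, a hedged sketch of why the hint buys nothing here, assuming a configuration (such as defconfig with CONFIG_SCHEDSTATS) where the predicate compiles down to a constant:

/* Roughly what sched_info_on() reduces to in such a configuration: */
static inline int sched_info_on(void)
{
        return 1;       /* compile-time constant: the compiler folds the branch */
}

/* So unlikely(sched_info_on()) annotates a branch that no longer exists;
 * the plain form reads better and generates the same code: */
if (sched_info_on()) {
        if (!t->sched_info.last_queued)
                t->sched_info.last_queued = rq_clock(rq);
}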
Signed-off-by: Yi Wang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: up2wing@gmail.com Cc: wang.liang82@zte.com.cn Cc: xue.zhihong@zte.com.cn Link: https://lkml.kernel.org/r/1562301307-43002-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/stats.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index aa0de240fb41..ba683fe81a6e 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -157,9 +157,10 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) { unsigned long long now = rq_clock(rq), delta = 0; - if (unlikely(sched_info_on())) + if (sched_info_on()) { if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; + } sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; @@ -192,7 +193,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) */ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) { - if (unlikely(sched_info_on())) { + if (sched_info_on()) { if (!t->sched_info.last_queued) t->sched_info.last_queued = rq_clock(rq); } @@ -239,7 +240,7 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct static inline void sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - if (unlikely(sched_info_on())) + if (sched_info_on()) __sched_info_switch(rq, prev, next); } -- cgit v1.2.3 From e0e8d4911ed2695b12c3a01c15634000ede9bc73 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 28 Jun 2019 16:51:41 +0800 Subject: sched/isolation: Prefer housekeeping CPU in local node In a real product setup there will be housekeeping CPUs in each node, and it is preferable to do housekeeping from the local node, falling back to the global online cpumask if no housekeeping CPU is found in the local node.
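In outline, the selection order the patch aims for is the following (a condensed sketch; the names come from the diff below, and nr_cpu_ids is the conventional "nothing found" return value):

/* Prefer a housekeeping CPU in the caller's NUMA node... */
cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
if (cpu < nr_cpu_ids)
        return cpu;
/* ...and only fall back to any online housekeeping CPU otherwise. */
return cpumask_any_and(housekeeping_mask, cpu_online_mask);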
Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker Reviewed-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/1561711901-4755-2-git-send-email-wanpengli@tencent.com Signed-off-by: Ingo Molnar --- kernel/sched/isolation.c | 12 ++++++++++-- kernel/sched/sched.h | 8 +++++--- kernel/sched/topology.c | 20 ++++++++++++++++++++ 3 files changed, 35 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index ccb28085b114..9fcb2a695a41 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -22,9 +22,17 @@ EXPORT_SYMBOL_GPL(housekeeping_enabled); int housekeeping_any_cpu(enum hk_flags flags) { - if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) + int cpu; + + if (static_branch_unlikely(&housekeeping_overridden)) { + if (housekeeping_flags & flags) { + cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id()); + if (cpu < nr_cpu_ids) + return cpu; + return cpumask_any_and(housekeeping_mask, cpu_online_mask); + } + } return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aaca0e743776..16126efd14ed 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1262,16 +1262,18 @@ enum numa_topology_type { extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); -#endif - -#ifdef CONFIG_NUMA extern void sched_init_numa(void); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); +extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); #else static inline void sched_init_numa(void) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ + return nr_cpu_ids; +} #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f751ce0b783e..4eea2c9bc732 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1724,6 +1724,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu) } } +/* + * sched_numa_find_closest() - given the NUMA topology, find the cpu + * closest to @cpu from @cpumask. + * cpumask: cpumask to find a cpu from + * cpu: cpu to be close to + * + * returns: cpu, or nr_cpu_ids when nothing found. + */ +int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ + int i, j = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]); + if (cpu < nr_cpu_ids) + return cpu; + } + return nr_cpu_ids; +} + #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) -- cgit v1.2.3 From 60e17f5cef838e9ca7946ced208ceddcec6c315d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 4 Jun 2019 12:31:52 +0530 Subject: sched/fair: Introduce fits_capacity() The same formula to check utilization against capacity (after considering capacity_margin) is already used at 5 different locations. This patch creates a new macro, fits_capacity(), which can be used from all these locations without exposing the details of it and hence simplify code. All the 5 code locations are updated as well to use it.. 
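A small self-contained illustration of what the new macro computes (the numbers are made up; capacity is on the usual 0..1024 scale and the 1280/1024 ratio encodes the ~20% margin):

#include <stdio.h>

#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)

int main(void)
{
        /* 800 * 1280 = 1024000 <  1048576 -> fits (leaves ~20% headroom) */
        printf("%d\n", fits_capacity(800, 1024));
        /* 850 * 1280 = 1088000 >= 1048576 -> does not fit */
        printf("%d\n", fits_capacity(850, 1024));
        return 0;
}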
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: https://lkml.kernel.org/r/b477ac75a2b163048bdaeb37f57b4c3f04f75a31.1559631700.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 52564e050062..fb75c0bea80f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu) } /* - * The margin used when comparing utilization with CPU capacity: - * util * margin < capacity * 1024 + * The margin used when comparing utilization with CPU capacity. * * (default: ~20%) */ -static unsigned int capacity_margin = 1280; +#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) + #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -3808,7 +3808,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) static inline int task_fits_capacity(struct task_struct *p, long capacity) { - return capacity * 1024 > task_util_est(p) * capacity_margin; + return fits_capacity(task_util_est(p), capacity); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -5235,7 +5235,7 @@ static inline unsigned long cpu_util(int cpu); static inline bool cpu_overutilized(int cpu) { - return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); + return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); } static inline void update_overutilized_status(struct rq *rq) @@ -6456,7 +6456,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) /* Skip CPUs that will be overutilized. */ util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - if (cpu_cap * 1024 < util * capacity_margin) + if (!fits_capacity(util, cpu_cap)) continue; /* Always use prev_cpu as a candidate. */ @@ -8011,8 +8011,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) static inline bool group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->min_capacity * capacity_margin < - ref->sgc->min_capacity * 1024; + return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity); } /* @@ -8022,8 +8021,7 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) static inline bool group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->max_capacity * capacity_margin < - ref->sgc->max_capacity * 1024; + return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity); } static inline enum -- cgit v1.2.3 From c22645f4c8f021fb1c5e7189eb1f968132cc0844 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 19 Jul 2019 15:59:53 +0200 Subject: sched/topology: Add partition_sched_domains_locked() Introduce the partition_sched_domains_locked() function by taking the mutex locking code out of the original function. That way the work done by partition_sched_domains_locked() can be reused without dropping the mutex lock. No change of functionality is introduced by this patch. 
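The split follows the familiar foo()/foo_locked() wrapper pattern; in outline (the full bodies are in the diff below):

/* Callers already holding sched_domains_mutex use the _locked variant... */
void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
                                    struct sched_domain_attr *dattr_new)
{
        lockdep_assert_held(&sched_domains_mutex);
        /* ... domain rebuild work, unchanged ... */
}

/* ...while the original entry point becomes a thin locking wrapper. */
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
                             struct sched_domain_attr *dattr_new)
{
        mutex_lock(&sched_domains_mutex);
        partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
        mutex_unlock(&sched_domains_mutex);
}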
Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-2-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4eea2c9bc732..5a174ae6ecf3 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2169,16 +2169,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. * - * Call with hotplug lock held + * Call with hotplug lock and sched_domains_mutex held */ -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) +void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; int i, j, n; int new_topology; - mutex_lock(&sched_domains_mutex); + lockdep_assert_held(&sched_domains_mutex); /* Always unregister in case we don't destroy any domains: */ unregister_sched_domain_sysctl(); @@ -2261,6 +2261,15 @@ match3: ndoms_cur = ndoms_new; register_sched_domain_sysctl(); +} +/* + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); mutex_unlock(&sched_domains_mutex); } -- cgit v1.2.3 From 4b211f2b129dd1f6a6956bbc76e2f232c1ec3ad8 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 19 Jul 2019 15:59:54 +0200 Subject: sched/core: Streamline calls to task_rq_unlock() Calls to task_rq_unlock() are done several times in the __sched_setscheduler() function. This is fine when only the rq lock needs to be handled but not so much when other locks come into play. This patch streamlines the release of the rq lock so that only one location needs to be modified when dealing with more than one lock. No change of functionality is introduced by this patch.
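A condensed sketch of the resulting error-path shape in __sched_setscheduler() (intermediate code elided; the full hunks follow below):

        if (p == rq->stop) {
                retval = -EINVAL;
                goto unlock;    /* was: task_rq_unlock(); return -EINVAL; */
        }
        /* ... */
        if (!cpumask_subset(span, p->cpus_ptr) || rq->rd->dl_bw.bw == 0) {
                retval = -EPERM;
                goto unlock;
        }
        /* ... */
unlock:
        task_rq_unlock(rq, p, &rf);
        return retval;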
Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-3-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0b22e55cebe8..1af3d2dc6b29 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4712,8 +4712,8 @@ recheck: * Changing the policy of the stop threads its a very bad idea: */ if (p == rq->stop) { - task_rq_unlock(rq, p, &rf); - return -EINVAL; + retval = -EINVAL; + goto unlock; } /* @@ -4731,8 +4731,8 @@ recheck: goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &rf); - return 0; + retval = 0; + goto unlock; } change: @@ -4745,8 +4745,8 @@ change: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &rf); - return -EPERM; + retval = -EPERM; + goto unlock; } #endif #ifdef CONFIG_SMP @@ -4761,8 +4761,8 @@ change: */ if (!cpumask_subset(span, p->cpus_ptr) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &rf); - return -EPERM; + retval = -EPERM; + goto unlock; } } #endif @@ -4781,8 +4781,8 @@ change: * is available. */ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &rf); - return -EBUSY; + retval = -EBUSY; + goto unlock; } p->sched_reset_on_fork = reset_on_fork; @@ -4840,6 +4840,10 @@ change: preempt_enable(); return 0; + +unlock: + task_rq_unlock(rq, p, &rf); + return retval; } static int _sched_setscheduler(struct task_struct *p, int policy, -- cgit v1.2.3 From f9a25f776d780bfa3279f0b6e5f5cf3224997976 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 19 Jul 2019 15:59:55 +0200 Subject: cpusets: Rebuild root domain deadline accounting information When the topology of root domains is modified by CPUset or CPUhotplug operations information about the current deadline bandwidth held in the root domain is lost. This patch addresses the issue by recalculating the lost deadline bandwidth information by circling through the deadline tasks held in CPUsets and adding their current load to the root domain they are associated with. Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Juri Lelli [ Various additional modifications. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-4-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/cgroup/cgroup.c | 2 +- kernel/cgroup/cpuset.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/deadline.c | 30 +++++++++++++++++++++++ kernel/sched/sched.h | 3 --- kernel/sched/topology.c | 13 +++++++++- 5 files changed, 106 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 753afbca549f..4b5bc452176c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1891,7 +1891,7 @@ static int cgroup_reconfigure(struct fs_context *fc) */ static bool use_task_css_set_links __read_mostly; -static void cgroup_enable_task_cg_lists(void) +void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5aa37531ce76..846cbdb68566 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -894,6 +895,67 @@ done: return ndoms; } +static void update_tasks_root_domain(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + + while ((task = css_task_iter_next(&it))) + dl_add_task_root_domain(task); + + css_task_iter_end(&it); +} + +static void rebuild_root_domains(void) +{ + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; + + lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + + cgroup_enable_task_cg_lists(); + + rcu_read_lock(); + + /* + * Clear default root domain DL accounting, it will be computed again + * if a task belongs to it. + */ + dl_clear_root_domain(&def_root_domain); + + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + + if (cpumask_empty(cs->effective_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + css_get(&cs->css); + + rcu_read_unlock(); + + update_tasks_root_domain(cs); + + rcu_read_lock(); + css_put(&cs->css); + } + rcu_read_unlock(); +} + +static void +partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + rebuild_root_domains(); + mutex_unlock(&sched_domains_mutex); +} + /* * Rebuild scheduler domains. 
* @@ -931,7 +993,7 @@ static void rebuild_sched_domains_locked(void) ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); + partition_and_rebuild_sched_domains(ndoms, doms, attr); out: put_online_cpus(); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ef5b9f6b1d42..0f9d2180be23 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2283,6 +2283,36 @@ void __init init_sched_dl_class(void) GFP_KERNEL, cpu_to_node(i)); } +void dl_add_task_root_domain(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + struct dl_bw *dl_b; + + rq = task_rq_lock(p, &rf); + if (!dl_task(p)) + goto unlock; + + dl_b = &rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + + __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); + + raw_spin_unlock(&dl_b->lock); + +unlock: + task_rq_unlock(rq, p, &rf); +} + +void dl_clear_root_domain(struct root_domain *rd) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + rd->dl_bw.total_bw = 0; + raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); +} + #endif /* CONFIG_SMP */ static void switched_from_dl(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 16126efd14ed..7583faddba33 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -778,9 +778,6 @@ struct root_domain { struct perf_domain __rcu *pd; }; -extern struct root_domain def_root_domain; -extern struct mutex sched_domains_mutex; - extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5a174ae6ecf3..8f83e8e3ea9a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2203,8 +2203,19 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && - dattrs_equal(dattr_cur, i, dattr_new, j)) + dattrs_equal(dattr_cur, i, dattr_new, j)) { + struct root_domain *rd; + + /* + * This domain won't be destroyed and as such + * its dl_bw->total_bw needs to be cleared. It + * will be recomputed in function + * update_tasks_root_domain(). + */ + rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; + dl_clear_root_domain(rd); goto match1; + } } /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); -- cgit v1.2.3 From 59d06cea1198d665ba11f7e8c5f45b00ff2e4812 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 15:59:56 +0200 Subject: sched/deadline: Fix bandwidth accounting at all levels after offline migration If a task happens to be throttled while the CPU it was running on gets hotplugged off, the bandwidth associated with the task is not correctly migrated with it when the replenishment timer fires (offline_migration). Fix things up, for this_bw, running_bw and total_bw, when replenishment timer fires and task is migrated (dl_task_offline_migration()). 
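In outline, the fix makes the bandwidth follow the task (a condensed sketch of the hunk below; the dl_non_contending/throttled distinction and the exact locking are in the full diff):

/* Per-rq accounting moves with the task... */
sub_rq_bw(&p->dl, &rq->dl);
add_rq_bw(&p->dl, &later_rq->dl);

/* ...and so does the root-domain bandwidth it was admitted against. */
raw_spin_lock(&rq->rd->dl_bw.lock);
__dl_sub(&rq->rd->dl_bw, p->dl.dl_bw, cpumask_weight(rq->rd->span));
raw_spin_unlock(&rq->rd->dl_bw.lock);

raw_spin_lock(&later_rq->rd->dl_bw.lock);
__dl_add(&later_rq->rd->dl_bw, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
raw_spin_unlock(&later_rq->rd->dl_bw.lock);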
Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-5-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0f9d2180be23..039dde2b1dac 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; + struct dl_bw *dl_b; later_rq = find_lock_later_rq(p, rq); if (!later_rq) { @@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p double_lock_balance(rq, later_rq); } + if (p->dl.dl_non_contending || p->dl.dl_throttled) { + /* + * Inactive timer is armed (or callback is running, but + * waiting for us to release rq locks). In any case, when it + * will fire (or continue), it will see running_bw of this + * task migrated to later_rq (and correctly handle it). + */ + sub_running_bw(&p->dl, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); + + add_rq_bw(&p->dl, &later_rq->dl); + add_running_bw(&p->dl, &later_rq->dl); + } else { + sub_rq_bw(&p->dl, &rq->dl); + add_rq_bw(&p->dl, &later_rq->dl); + } + + /* + * And we finally need to fixup root_domain(s) bandwidth accounting, + * since p is still hanging out in the old (now moved to default) root + * domain. + */ + dl_b = &rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span)); + raw_spin_unlock(&dl_b->lock); + + dl_b = &later_rq->rd->dl_bw; + raw_spin_lock(&dl_b->lock); + __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span)); + raw_spin_unlock(&dl_b->lock); + set_task_cpu(p, later_rq->cpu); double_unlock_balance(later_rq, rq); -- cgit v1.2.3 From 1243dc518c9da467da6635313a2dbb41b8ffc275 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 15:59:57 +0200 Subject: cgroup/cpuset: Convert cpuset_mutex to percpu_rwsem Holding cpuset_mutex means that cpusets are stable (only the holder can make changes) and this is required for fixing a synchronization issue between cpusets and scheduler core. However, grabbing cpuset_mutex from setscheduler() hotpath (as implemented in a later patch) is a no-go, as it would create a bottleneck for tasks concurrently calling setscheduler(). Convert cpuset_mutex to be a percpu_rwsem (cpuset_rwsem), so that setscheduler() will then be able to read lock it and avoid concurrency issues. 
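A hedged sketch of the intended locking split after the conversion (the read-side user is only added by a later patch in this series):

DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);

/* cpuset reconfiguration (rare, slow path): exclusive writer, just as
 * cpuset_mutex was. */
percpu_down_write(&cpuset_rwsem);
/* ... modify cpusets, rebuild sched domains ... */
percpu_up_write(&cpuset_rwsem);

/* setscheduler() hotpath (frequent): cheap shared reader, so concurrent
 * callers no longer serialize on a single mutex. */
percpu_down_read(&cpuset_rwsem);
/* ... deadline admission test against a stable cpuset topology ... */
percpu_up_read(&cpuset_rwsem);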
Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-6-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 68 ++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 846cbdb68566..e1a8d168c5e9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -333,7 +333,7 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ -static DEFINE_MUTEX(cpuset_mutex); +DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; @@ -913,7 +913,7 @@ static void rebuild_root_domains(void) struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; - lockdep_assert_held(&cpuset_mutex); + percpu_rwsem_assert_held(&cpuset_rwsem); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); @@ -973,7 +973,7 @@ static void rebuild_sched_domains_locked(void) cpumask_var_t *doms; int ndoms; - lockdep_assert_held(&cpuset_mutex); + percpu_rwsem_assert_held(&cpuset_rwsem); get_online_cpus(); /* @@ -1005,9 +1005,9 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); rebuild_sched_domains_locked(); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /** @@ -1113,7 +1113,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ bool part_error = false; /* Partition error? */ - lockdep_assert_held(&cpuset_mutex); + percpu_rwsem_assert_held(&cpuset_rwsem); /* * The parent must be a partition root. 
@@ -2101,7 +2101,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); cs = css_cs(css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; @@ -2125,7 +2125,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) cs->attach_in_progress++; ret = 0; out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); return ret; } @@ -2135,9 +2135,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); css_cs(css)->attach_in_progress--; - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /* @@ -2160,7 +2160,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* prepare for attach */ if (cs == &top_cpuset) @@ -2214,7 +2214,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /* The various types of files and directories in a cpuset file system */ @@ -2245,7 +2245,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -2281,7 +2281,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); return retval; } @@ -2292,7 +2292,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2305,7 +2305,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); return retval; } @@ -2344,7 +2344,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2368,7 +2368,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2499,13 +2499,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, return -EINVAL; css_get(&cs->css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; retval = update_prstate(cs, val); out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); css_put(&cs->css); return retval ?: nbytes; } @@ -2711,7 +2711,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) @@ -2762,7 +2762,7 @@ static int cpuset_css_online(struct 
cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); return 0; } @@ -2781,7 +2781,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); if (is_partition_root(cs)) update_prstate(cs, 0); @@ -2800,7 +2800,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -2812,7 +2812,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { @@ -2825,7 +2825,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) } spin_unlock_irq(&callback_lock); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /* @@ -2867,6 +2867,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = { int __init cpuset_init(void) { + BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); @@ -2938,7 +2940,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); /* * Move tasks to the nearest ancestor with execution resources, @@ -2948,7 +2950,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (is_empty) remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); } static void @@ -2998,14 +3000,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); goto retry; } @@ -3073,7 +3075,7 @@ update_tasks: hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); } /** @@ -3103,7 +3105,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; - mutex_lock(&cpuset_mutex); + percpu_down_write(&cpuset_rwsem); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); @@ -3153,7 +3155,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(&top_cpuset); } - mutex_unlock(&cpuset_mutex); + percpu_up_write(&cpuset_rwsem); /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { -- cgit v1.2.3 From d74b27d63a8bebe2fe634944e4ebdc7b10db7a39 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 15:59:58 +0200 Subject: cgroup/cpuset: Change cpuset_rwsem and hotplug lock order cpuset_rwsem is going to be acquired from sched_setscheduler() with a following patch. 
There are however paths (e.g., spawn_ksoftirqd) in which sched_scheduler() is eventually called while holding hotplug lock; this creates a dependecy between hotplug lock (to be always acquired first) and cpuset_rwsem (to be always acquired after hotplug lock). Fix paths which currently take the two locks in the wrong order (after a following patch is applied). Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-7-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e1a8d168c5e9..5c5014caa23c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -973,8 +973,8 @@ static void rebuild_sched_domains_locked(void) cpumask_var_t *doms; int ndoms; + lockdep_assert_cpus_held(); percpu_rwsem_assert_held(&cpuset_rwsem); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -983,19 +983,17 @@ static void rebuild_sched_domains_locked(void) */ if (!top_cpuset.nr_subparts_cpus && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; if (top_cpuset.nr_subparts_cpus && !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_and_rebuild_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ static void rebuild_sched_domains_locked(void) @@ -1005,9 +1003,11 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { + get_online_cpus(); percpu_down_write(&cpuset_rwsem); rebuild_sched_domains_locked(); percpu_up_write(&cpuset_rwsem); + put_online_cpus(); } /** @@ -2245,6 +2245,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; + get_online_cpus(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -2282,6 +2283,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return retval; } @@ -2292,6 +2294,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; + get_online_cpus(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2306,6 +2309,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return retval; } @@ -2344,6 +2348,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); + get_online_cpus(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2369,6 +2374,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: percpu_up_write(&cpuset_rwsem); + 
put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2499,6 +2505,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, return -EINVAL; css_get(&cs->css); + get_online_cpus(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2506,6 +2513,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, retval = update_prstate(cs, val); out_unlock: percpu_up_write(&cpuset_rwsem); + put_online_cpus(); css_put(&cs->css); return retval ?: nbytes; } @@ -2711,6 +2719,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; + get_online_cpus(); percpu_down_write(&cpuset_rwsem); set_bit(CS_ONLINE, &cs->flags); @@ -2763,6 +2772,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_unlock_irq(&callback_lock); out_unlock: percpu_up_write(&cpuset_rwsem); + put_online_cpus(); return 0; } @@ -2781,6 +2791,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + get_online_cpus(); percpu_down_write(&cpuset_rwsem); if (is_partition_root(cs)) @@ -2801,6 +2812,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags); percpu_up_write(&cpuset_rwsem); + put_online_cpus(); } static void cpuset_css_free(struct cgroup_subsys_state *css) -- cgit v1.2.3 From 1a763fd7c6335e3122c1cc09576ef6c99ada4267 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 15:59:59 +0200 Subject: rcu/tree: Call setschedule() gp ktread to SCHED_FIFO outside of atomic region sched_setscheduler() needs to acquire cpuset_rwsem, but it is currently called from an invalid (atomic) context by rcu_spawn_gp_kthread(). Fix that by simply moving sched_setscheduler_nocheck() call outside of the atomic region, as it doesn't actually require to be guarded by rcu_node lock. 
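In outline, the fix is purely an ordering change (a condensed sketch; the exact hunk follows below):

t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);

/* sched_setscheduler_nocheck() may now sleep (it can take cpuset_rwsem),
 * so call it before taking the raw rcu_node spinlock... */
if (kthread_prio)
        sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);

/* ...and keep only non-sleeping work inside the atomic section. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rcu_state.gp_kthread = t;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);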
Suggested-by: Peter Zijlstra Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-8-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a14e5fbbea46..eb764c24bc4d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3234,13 +3234,13 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) return 0; + if (kthread_prio) + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); rnp = rcu_get_root(); raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.gp_kthread = t; - if (kthread_prio) { + if (kthread_prio) sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); rcu_spawn_nocb_kthreads(); -- cgit v1.2.3 From 710da3c8ea7dfbd327920afd3831d8c82c42789d Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 16:00:00 +0200 Subject: sched/core: Prevent race condition between cpuset and __sched_setscheduler() No synchronisation mechanism exists between the cpuset subsystem and calls to function __sched_setscheduler(). As such, it is possible that new root domains are created on the cpuset side while a deadline acceptance test is carried out in __sched_setscheduler(), leading to a potential oversell of CPU bandwidth. Grab cpuset_rwsem read lock from core scheduler, so to prevent situations such as the one described above from happening. The only exception is normalize_rt_tasks() which needs to work under tasklist_lock and can't therefore grab cpuset_rwsem. We are fine with this, as this function is only called by sysrq and, if that gets triggered, DEADLINE guarantees are already gone out of the window anyway. 
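A condensed sketch of the two halves of the fix (both are spelled out in the diff below):

/* __sched_setscheduler(): hold the cpuset read lock across the admission
 * test so root domains cannot change underneath it. */
if (pi)
        cpuset_read_lock();
/* ... bandwidth/admission checks ... */
if (pi)
        cpuset_read_unlock();

/* do_sched_setscheduler(): pin the task inside RCU, then call
 * sched_setscheduler() where sleeping on cpuset_rwsem is allowed. */
rcu_read_lock();
p = find_process_by_pid(pid);
if (likely(p))
        get_task_struct(p);
rcu_read_unlock();

if (likely(p)) {
        retval = sched_setscheduler(p, policy, &lparam);
        put_task_struct(p);
}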
Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-9-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 11 +++++++++++ kernel/sched/core.c | 20 +++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5c5014caa23c..c52bc91f882b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -334,6 +334,17 @@ static struct cpuset top_cpuset = { */ DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); + +void cpuset_read_lock(void) +{ + percpu_down_read(&cpuset_rwsem); +} + +void cpuset_read_unlock(void) +{ + percpu_up_read(&cpuset_rwsem); +} + static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1af3d2dc6b29..1bceb22dac18 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4698,6 +4698,9 @@ recheck: return retval; } + if (pi) + cpuset_read_lock(); + /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4772,6 +4775,8 @@ change: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); + if (pi) + cpuset_read_unlock(); goto recheck; } @@ -4832,8 +4837,10 @@ change: preempt_disable(); task_rq_unlock(rq, p, &rf); - if (pi) + if (pi) { + cpuset_read_unlock(); rt_mutex_adjust_pi(p); + } /* Run balance callbacks after we've adjusted the PI chain: */ balance_callback(rq); @@ -4843,6 +4850,8 @@ change: unlock: task_rq_unlock(rq, p, &rf); + if (pi) + cpuset_read_unlock(); return retval; } @@ -4927,10 +4936,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); + if (likely(p)) + get_task_struct(p); rcu_read_unlock(); + if (likely(p)) { + retval = sched_setscheduler(p, policy, &lparam); + put_task_struct(p); + } + return retval; } -- cgit v1.2.3 From a07db5c0865799ebed1f88be0df50c581fb65029 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 08:34:55 +0200 Subject: sched/core: Fix CPU controller for !RT_GROUP_SCHED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On !CONFIG_RT_GROUP_SCHED configurations it is currently not possible to move RT tasks between cgroups to which CPU controller has been attached; but it is oddly possible to first move tasks around and then make them RT (setschedule to FIFO/RR). 
E.g.: # mkdir /sys/fs/cgroup/cpu,cpuacct/group1 # chrt -fp 10 $$ # echo $$ > /sys/fs/cgroup/cpu,cpuacct/group1/tasks bash: echo: write error: Invalid argument # chrt -op 0 $$ # echo $$ > /sys/fs/cgroup/cpu,cpuacct/group1/tasks # chrt -fp 10 $$ # cat /sys/fs/cgroup/cpu,cpuacct/group1/tasks 2345 2598 # chrt -p 2345 pid 2345's current scheduling policy: SCHED_FIFO pid 2345's current scheduling priority: 10 Also, as Michal noted, it is currently not possible to enable CPU controller on unified hierarchy with !CONFIG_RT_GROUP_SCHED (if there are any kernel RT threads in root cgroup, they can't be migrated to the newly created CPU controller's root in cgroup_update_dfl_csses()). Existing code comes with a comment saying the "we don't support RT-tasks being in separate groups". Such comment is however stale and belongs to pre-RT_GROUP_SCHED times. Also, it doesn't make much sense for !RT_GROUP_ SCHED configurations, since checks related to RT bandwidth are not performed at all in these cases. Make moving RT tasks between CPU controller groups viable by removing special case check for RT (and DEADLINE) tasks. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutný Reviewed-by: Daniel Bristot de Oliveira Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20190719063455.27328-1-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1bceb22dac18..042c736b2b73 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6995,10 +6995,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#else - /* We don't support RT-tasks being in separate groups */ - if (task->sched_class != &fair_sched_class) - return -EINVAL; #endif /* * Serialize against wake_up_new_task() such that if its -- cgit v1.2.3 From a1dc0446d64966dc0ae756aebdc449f335742c13 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 19 Jul 2019 21:23:19 -0400 Subject: sched/core: Silence a warning in sched_init() Compiling a kernel with both FAIR_GROUP_SCHED=n and RT_GROUP_SCHED=n will generate a compiler warning: kernel/sched/core.c: In function 'sched_init': kernel/sched/core.c:5906:32: warning: variable 'ptr' set but not used It is unnecessary to have both "alloc_size" and "ptr", so just combine them. 
Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/20190720012319.884-1-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 042c736b2b73..46f3ca9e392a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6430,19 +6430,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); void __init sched_init(void) { - unsigned long alloc_size = 0, ptr; + unsigned long ptr = 0; int i; wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); #endif #ifdef CONFIG_RT_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); #endif - if (alloc_size) { - ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); + if (ptr) { + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.se = (struct sched_entity **)ptr; -- cgit v1.2.3 From 6fc4dbcf0276279d488c5fbbfabe94734134f4fa Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 18 Jul 2019 23:01:46 +0800 Subject: padata: Replace delayed timer with immediate workqueue in padata_reorder The function padata_reorder will use a timer when it cannot progress while completed jobs are outstanding (pd->reorder_objects > 0). This is suboptimal as if we do end up using the timer then it would have introduced a gratuitous delay of one second. In fact we can easily distinguish between whether completed jobs are outstanding and whether we can make progress. All we have to do is look at the next pqueue list. This patch does that by replacing pd->processed with pd->cpu so that the next pqueue is more accessible. A work queue is used instead of the original try_again to avoid hogging the CPU. Note that we don't bother removing the work queue in padata_flush_queues because the whole premise is broken. You cannot flush async crypto requests so it makes no sense to even try. A subsequent patch will fix it by replacing it with a ref counting scheme. Signed-off-by: Herbert Xu --- kernel/padata.c | 97 +++++++++++---------------------------------------------- 1 file changed, 18 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 15a8ad63f4ff..fbafca18597f 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -165,23 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel); */ static struct padata_priv *padata_get_next(struct parallel_data *pd) { - int cpu, num_cpus; - unsigned int next_nr, next_index; struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; + int cpu = pd->cpu; - num_cpus = cpumask_weight(pd->cpumask.pcpu); - - /* - * Calculate the percpu reorder queue and the sequence - * number of the next object. 
- */ - next_nr = pd->processed; - next_index = next_nr % num_cpus; - cpu = padata_index_to_cpu(pd, next_index); next_queue = per_cpu_ptr(pd->pqueue, cpu); - reorder = &next_queue->reorder; spin_lock(&reorder->lock); @@ -192,7 +181,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); - pd->processed++; + pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, + false); spin_unlock(&reorder->lock); goto out; @@ -215,6 +205,7 @@ static void padata_reorder(struct parallel_data *pd) struct padata_priv *padata; struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; + struct padata_parallel_queue *next_queue; /* * We need to ensure that only one cpu can work on dequeueing of @@ -246,7 +237,6 @@ static void padata_reorder(struct parallel_data *pd) * so exit immediately. */ if (PTR_ERR(padata) == -ENODATA) { - del_timer(&pd->timer); spin_unlock_bh(&pd->lock); return; } @@ -265,70 +255,29 @@ static void padata_reorder(struct parallel_data *pd) /* * The next object that needs serialization might have arrived to - * the reorder queues in the meantime, we will be called again - * from the timer function if no one else cares for it. + * the reorder queues in the meantime. * - * Ensure reorder_objects is read after pd->lock is dropped so we see - * an increment from another task in padata_do_serial. Pairs with + * Ensure reorder queue is read after pd->lock is dropped so we see + * new objects from another task in padata_do_serial. Pairs with * smp_mb__after_atomic in padata_do_serial. */ smp_mb(); - if (atomic_read(&pd->reorder_objects) - && !(pinst->flags & PADATA_RESET)) - mod_timer(&pd->timer, jiffies + HZ); - else - del_timer(&pd->timer); - return; + next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); + if (!list_empty(&next_queue->reorder.list)) + queue_work(pinst->wq, &pd->reorder_work); } static void invoke_padata_reorder(struct work_struct *work) { - struct padata_parallel_queue *pqueue; struct parallel_data *pd; local_bh_disable(); - pqueue = container_of(work, struct padata_parallel_queue, reorder_work); - pd = pqueue->pd; + pd = container_of(work, struct parallel_data, reorder_work); padata_reorder(pd); local_bh_enable(); } -static void padata_reorder_timer(struct timer_list *t) -{ - struct parallel_data *pd = from_timer(pd, t, timer); - unsigned int weight; - int target_cpu, cpu; - - cpu = get_cpu(); - - /* We don't lock pd here to not interfere with parallel processing - * padata_reorder() calls on other CPUs. We just need any CPU out of - * the cpumask.pcpu set. It would be nice if it's the right one but - * it doesn't matter if we're off to the next one by using an outdated - * pd->processed value. - */ - weight = cpumask_weight(pd->cpumask.pcpu); - target_cpu = padata_index_to_cpu(pd, pd->processed % weight); - - /* ensure to call the reorder callback on the correct CPU */ - if (cpu != target_cpu) { - struct padata_parallel_queue *pqueue; - struct padata_instance *pinst; - - /* The timer function is serialized wrt itself -- no locking - * needed. 
- */ - pinst = pd->pinst; - pqueue = per_cpu_ptr(pd->pqueue, target_cpu); - queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work); - } else { - padata_reorder(pd); - } - - put_cpu(); -} - static void padata_serial_worker(struct work_struct *serial_work) { struct padata_serial_queue *squeue; @@ -376,9 +325,8 @@ void padata_do_serial(struct padata_priv *padata) cpu = get_cpu(); - /* We need to run on the same CPU padata_do_parallel(.., padata, ..) - * was called on -- or, at least, enqueue the padata object into the - * correct per-cpu queue. + /* We need to enqueue the padata object into the correct + * per-cpu queue. */ if (cpu != padata->cpu) { reorder_via_wq = 1; @@ -388,12 +336,12 @@ void padata_do_serial(struct padata_priv *padata) pqueue = per_cpu_ptr(pd->pqueue, cpu); spin_lock(&pqueue->reorder.lock); - atomic_inc(&pd->reorder_objects); list_add_tail(&padata->list, &pqueue->reorder.list); + atomic_inc(&pd->reorder_objects); spin_unlock(&pqueue->reorder.lock); /* - * Ensure the atomic_inc of reorder_objects above is ordered correctly + * Ensure the addition to the reorder list is ordered correctly * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb * in padata_reorder. */ @@ -401,13 +349,7 @@ void padata_do_serial(struct padata_priv *padata) put_cpu(); - /* If we're running on the wrong CPU, call padata_reorder() via a - * kernel worker. - */ - if (reorder_via_wq) - queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work); - else - padata_reorder(pd); + padata_reorder(pd); } EXPORT_SYMBOL(padata_do_serial); @@ -463,14 +405,12 @@ static void padata_init_pqueues(struct parallel_data *pd) continue; } - pqueue->pd = pd; pqueue->cpu_index = cpu_index; cpu_index++; __padata_list_init(&pqueue->reorder); __padata_list_init(&pqueue->parallel); INIT_WORK(&pqueue->work, padata_parallel_worker); - INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder); atomic_set(&pqueue->num_obj, 0); } } @@ -498,12 +438,13 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); - timer_setup(&pd->timer, padata_reorder_timer, 0); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; spin_lock_init(&pd->lock); + pd->cpu = cpumask_first(pcpumask); + INIT_WORK(&pd->reorder_work, invoke_padata_reorder); return pd; @@ -538,8 +479,6 @@ static void padata_flush_queues(struct parallel_data *pd) flush_work(&pqueue->work); } - del_timer_sync(&pd->timer); - if (atomic_read(&pd->reorder_objects)) padata_reorder(pd); -- cgit v1.2.3 From 065cf577135a4977931c7a1e1edf442bfd9773dd Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Fri, 19 Jul 2019 15:04:44 -0400 Subject: padata: purge get_cpu and reorder_via_wq from padata_do_serial With the removal of the padata timer, padata_do_serial no longer needs special CPU handling, so remove it. 
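For context, the usual calling pattern now becomes straightforward; the sketch below assumes the padata API of this kernel generation, struct and function names other than the padata_* ones are invented, and error handling is omitted:

 #include <linux/padata.h>

 struct my_job {
 	struct padata_priv padata;	/* embedded in the user's object */
 	/* ... per-request data ... */
 };

 static void my_parallel(struct padata_priv *padata)
 {
 	/* CPU-heavy work runs here, on whichever parallel CPU padata chose */
 	padata_do_serial(padata);	/* hand the object back for in-order completion */
 }

 static void my_serial(struct padata_priv *padata)
 {
 	struct my_job *job = container_of(padata, struct my_job, padata);

 	pr_info("job %p completed in submission order\n", job);
 }

 static int my_submit(struct padata_instance *pinst, struct my_job *job, int cb_cpu)
 {
 	job->padata.parallel = my_parallel;
 	job->padata.serial = my_serial;

 	/*
 	 * The core records the chosen parallel CPU in padata->cpu at
 	 * submission time; padata_do_serial() only needs that to find the
 	 * right reorder queue, hence no get_cpu() dance is required there.
 	 * cb_cpu must be part of the instance's serial cpumask.
 	 */
 	return padata_do_parallel(pinst, &job->padata, cb_cpu);
 }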
Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index fbafca18597f..7372fb45eeeb 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -316,24 +316,9 @@ static void padata_serial_worker(struct work_struct *serial_work) */ void padata_do_serial(struct padata_priv *padata) { - int cpu; - struct padata_parallel_queue *pqueue; - struct parallel_data *pd; - int reorder_via_wq = 0; - - pd = padata->pd; - - cpu = get_cpu(); - - /* We need to enqueue the padata object into the correct - * per-cpu queue. - */ - if (cpu != padata->cpu) { - reorder_via_wq = 1; - cpu = padata->cpu; - } - - pqueue = per_cpu_ptr(pd->pqueue, cpu); + struct parallel_data *pd = padata->pd; + struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, + padata->cpu); spin_lock(&pqueue->reorder.lock); list_add_tail(&padata->list, &pqueue->reorder.list); @@ -347,8 +332,6 @@ void padata_do_serial(struct padata_priv *padata) */ smp_mb__after_atomic(); - put_cpu(); - padata_reorder(pd); } EXPORT_SYMBOL(padata_do_serial); -- cgit v1.2.3 From fca16e51078e8e5c0af839426b3d2dcd2bede135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:53 +0200 Subject: xdp: Refactor devmap allocation code for reuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The subsequent patch to add a new devmap sub-type can re-use much of the initialisation and allocation code, so refactor it into separate functions. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 136 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 83 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d83cf8ccc872..a0501266bdb8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,9 +60,9 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; - unsigned int bit; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; + unsigned int idx; /* keep track of map index for tracepoint */ }; struct bpf_dtab { @@ -75,28 +75,21 @@ struct bpf_dtab { static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - struct bpf_dtab *dtab; int err, cpu; u64 cost; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); + return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; - dtab = kzalloc(sizeof(*dtab), GFP_USER); - if (!dtab) - return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&dtab->map, attr); @@ -107,9 +100,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) - goto free_dtab; - 
- err = -ENOMEM; + return -EINVAL; dtab->flush_list = alloc_percpu(struct list_head); if (!dtab->flush_list) @@ -124,19 +115,38 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; - spin_lock(&dev_map_lock); - list_add_tail_rcu(&dtab->list, &dev_map_list); - spin_unlock(&dev_map_lock); - - return &dtab->map; + return 0; free_percpu: free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); -free_dtab: - kfree(dtab); - return ERR_PTR(err); + return -ENOMEM; +} + +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +{ + struct bpf_dtab *dtab; + int err; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + dtab = kzalloc(sizeof(*dtab), GFP_USER); + if (!dtab) + return ERR_PTR(-ENOMEM); + + err = dev_map_init_map(dtab, attr); + if (err) { + kfree(dtab); + return ERR_PTR(err); + } + + spin_lock(&dev_map_lock); + list_add_tail_rcu(&dtab->list, &dev_map_list); + spin_unlock(&dev_map_lock); + + return &dtab->map; } static void dev_map_free(struct bpf_map *map) @@ -235,7 +245,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, out: bq->count = 0; - trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); @@ -412,17 +422,52 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, + struct bpf_dtab *dtab, + u32 ifindex, + unsigned int idx) { - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct net *net = current->nsproxy->net_ns; gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + struct bpf_dtab_netdev *dev; + struct xdp_bulk_queue *bq; + int cpu; + + dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + + dev->dev = dev_get_by_index(net, ifindex); + if (!dev->dev) { + free_percpu(dev->bulkq); + kfree(dev); + return ERR_PTR(-EINVAL); + } + + dev->idx = idx; + dev->dtab = dtab; + + return dev; +} + +static int __dev_map_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; u32 ifindex = *(u32 *)value; - struct xdp_bulk_queue *bq; u32 i = *(u32 *)key; - int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -434,31 +479,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); - if (!dev) - return -ENOMEM; - - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), - sizeof(void *), gfp); - if (!dev->bulkq) { - kfree(dev); - return -ENOMEM; - } - - for_each_possible_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq->obj = dev; - } - - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - free_percpu(dev->bulkq); - kfree(dev); - return -EINVAL; - } - - dev->bit = i; - dev->dtab = dtab; + dev = __dev_map_alloc_node(net, dtab, ifindex, i); + if (IS_ERR(dev)) + return PTR_ERR(dev); } /* Use call_rcu() here to 
ensure rcu critical sections have completed @@ -472,6 +495,13 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, -- cgit v1.2.3 From 6f9d451ab1a33728adb72d7ff66a7b374d665176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:55 +0200 Subject: xdp: Add devmap_hash map type for looking up devices by hashed index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common pattern when using xdp_redirect_map() is to create a device map where the lookup key is simply ifindex. Because device maps are arrays, this leaves holes in the map, and the map has to be sized to fit the largest ifindex, regardless of how many devices actually are actually needed in the map. This patch adds a second type of device map where the key is looked up using a hashmap, instead of being used as an array index. This allows maps to be densely packed, so they can be smaller. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 2 + 2 files changed, 202 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a0501266bdb8..9af048a932b5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -37,6 +37,12 @@ * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. + * + * The devmap_hash type is a map type which interprets keys as ifindexes and + * indexes these using a hashmap. This allows maps that use ifindex as key to be + * densely packed instead of having holes in the lookup array for unused + * ifindexes. The setup and packet enqueue/send code is shared between the two + * types of devmap; only the lookup and insertion is different. 
*/ #include #include @@ -59,6 +65,7 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ + struct hlist_node index_hlist; struct bpf_dtab *dtab; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; @@ -70,11 +77,30 @@ struct bpf_dtab { struct bpf_dtab_netdev **netdev_map; struct list_head __percpu *flush_list; struct list_head list; + + /* these are only used for DEVMAP_HASH type maps */ + struct hlist_head *dev_index_head; + spinlock_t index_lock; + unsigned int items; + u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); +static struct hlist_head *dev_map_create_hash(unsigned int entries) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < entries; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -97,6 +123,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); cost += sizeof(struct list_head) * num_possible_cpus(); + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + + if (!dtab->n_buckets) /* Overflow check */ + return -EINVAL; + cost += sizeof(struct hlist_head) * dtab->n_buckets; + } + /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) @@ -115,8 +149,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + if (!dtab->dev_index_head) + goto free_map_area; + + spin_lock_init(&dtab->index_lock); + } + return 0; +free_map_area: + bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -198,6 +242,7 @@ static void dev_map_free(struct bpf_map *map) free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); + kfree(dtab->dev_index_head); kfree(dtab); } @@ -218,6 +263,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct hlist_head *head = dev_map_index_hash(dtab, key); + struct bpf_dtab_netdev *dev; + + hlist_for_each_entry_rcu(dev, head, index_hlist) + if (dev->idx == key) + return dev; + + return NULL; +} + +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 idx, *next = next_key; + struct bpf_dtab_netdev *dev, *next_dev; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first; + + idx = *(u32 *)key; + + dev = __dev_map_hash_lookup_elem(map, idx); + if (!dev) + goto find_first; + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), + struct bpf_dtab_netdev, index_hlist); + + if (next_dev) { + *next = next_dev->idx; + return 0; + } + + i = idx & (dtab->n_buckets - 1); + i++; + + find_first: + for (; i < dtab->n_buckets; i++) { + head = 
dev_map_index_hash(dtab, i); + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_dtab_netdev, + index_hlist); + if (next_dev) { + *next = next_dev->idx; + return 0; + } + } + + return -ENOENT; +} + static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { @@ -373,6 +482,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, + *(u32 *)key); + struct net_device *dev = obj ? obj->dev : NULL; + + return dev ? &dev->ifindex : NULL; +} + static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { @@ -422,6 +540,28 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + unsigned long flags; + int ret = -ENOENT; + + spin_lock_irqsave(&dtab->index_lock, flags); + + old_dev = __dev_map_hash_lookup_elem(map, k); + if (old_dev) { + dtab->items--; + hlist_del_init_rcu(&old_dev->index_hlist); + call_rcu(&old_dev->rcu, __dev_map_entry_free); + ret = 0; + } + spin_unlock_irqrestore(&dtab->index_lock, flags); + + return ret; +} + static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, u32 ifindex, @@ -502,6 +642,56 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev, *old_dev; + u32 ifindex = *(u32 *)value; + u32 idx = *(u32 *)key; + unsigned long flags; + + if (unlikely(map_flags > BPF_EXIST || !ifindex)) + return -EINVAL; + + old_dev = __dev_map_hash_lookup_elem(map, idx); + if (old_dev && (map_flags & BPF_NOEXIST)) + return -EEXIST; + + dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + spin_lock_irqsave(&dtab->index_lock, flags); + + if (old_dev) { + hlist_del_rcu(&old_dev->index_hlist); + } else { + if (dtab->items >= dtab->map.max_entries) { + spin_unlock_irqrestore(&dtab->index_lock, flags); + call_rcu(&dev->rcu, __dev_map_entry_free); + return -E2BIG; + } + dtab->items++; + } + + hlist_add_head_rcu(&dev->index_hlist, + dev_map_index_hash(dtab, idx)); + spin_unlock_irqrestore(&dtab->index_lock, flags); + + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; +} + +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_hash_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -512,6 +702,16 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, }; +const struct bpf_map_ops dev_map_hash_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_hash_get_next_key, + .map_lookup_elem = dev_map_hash_lookup_elem, + .map_update_elem = dev_map_hash_update_elem, + .map_delete_elem = dev_map_hash_delete_elem, + .map_check_btf = map_check_no_btf, +}; + static int 
dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5900cbb966b1..cef851cd5c36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3457,6 +3457,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: if (func_id != BPF_FUNC_redirect_map && func_id != BPF_FUNC_map_lookup_elem) goto error; @@ -3539,6 +3540,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && map->map_type != BPF_MAP_TYPE_CPUMAP && map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; -- cgit v1.2.3 From 38f054d549a869f22a02224cd276a27bf14b6171 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Tue, 23 Jul 2019 15:26:28 +0200 Subject: modules: always page-align module section allocations Some arches (e.g., arm64, x86) have moved towards non-executable module_alloc() allocations for security hardening reasons. That means that the module loader will need to set the text section of a module to executable, regardless of whether or not CONFIG_STRICT_MODULE_RWX is set. When CONFIG_STRICT_MODULE_RWX=y, module section allocations are always page-aligned to handle memory rwx permissions. On some arches with CONFIG_STRICT_MODULE_RWX=n however, when setting the module text to executable, the BUG_ON() in frob_text() gets triggered since module section allocations are not page-aligned when CONFIG_STRICT_MODULE_RWX=n. Since the set_memory_* API works with pages, and since we need to call set_memory_x() regardless of whether CONFIG_STRICT_MODULE_RWX is set, we might as well page-align all module section allocations for ease of managing rwx permissions of module sections (text, rodata, etc). Fixes: 2eef1399a866 ("modules: fix BUG when load module with rodata=n") Reported-by: Martin Kaiser Reported-by: Bartosz Golaszewski Tested-by: David Lechner Tested-by: Martin Kaiser Tested-by: Bartosz Golaszewski Signed-off-by: Jessica Yu --- kernel/module.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5933395af9a0..cd8df516666d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -64,14 +64,9 @@ /* * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data, but - * only when CONFIG_STRICT_MODULE_RWX=y + * to ensure complete separation of code and data */ -#ifdef CONFIG_STRICT_MODULE_RWX # define debug_align(X) ALIGN(X, PAGE_SIZE) -#else -# define debug_align(X) (X) -#endif /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -- cgit v1.2.3 From 2b089bf8d19c66f70ae3b2d2d101be1ae49bfe24 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 22 Jul 2019 11:20:08 +0200 Subject: kernel/configs: Replace GPL boilerplate code with SPDX identifier The FSF does not reside in "675 Mass Ave, Cambridge" anymore... let's replace the old GPL boilerplate code with a proper SPDX identifier instead. 
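For reference, the kernel's license-rules documentation expects the identifier on the very first line, written as a // comment in C source files and as a /* */ comment in headers, so a converted source file typically opens like this (the file below is only an illustration, not one of the files touched here):

 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * kernel/example.c - what the file does
  *
  * Copyright (C) 2002 Original Author
  */

A header would instead begin with /* SPDX-License-Identifier: GPL-2.0-or-later */ on its first line.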
Signed-off-by: Thomas Huth Signed-off-by: Greg Kroah-Hartman --- kernel/configs.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index b062425ccf8d..c09ea4c995e1 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * kernel/configs.c * Echo the kernel .config file used to build the kernel @@ -6,21 +7,6 @@ * Copyright (C) 2002 Randy Dunlap * Copyright (C) 2002 Al Stone * Copyright (C) 2002 Hewlett-Packard Company - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include -- cgit v1.2.3 From b74494872555d1f7888dfd9225700a363f4a84fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 20:30:49 +0200 Subject: hrtimer: Remove task argument from hrtimer_init_sleeper() All callers hand in 'current' and that's the only task pointer which actually makes sense. Remove the task argument and set current in the function. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185752.791885290@linutronix.de --- kernel/futex.c | 2 +- kernel/time/hrtimer.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6d50728ef2e7..5e9842ea4012 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -490,7 +490,7 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? 
CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - hrtimer_init_sleeper(timeout, current); + hrtimer_init_sleeper(timeout); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5ee77f1a8a92..de895d86800c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1639,10 +1639,10 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl) { sl->timer.function = hrtimer_wakeup; - sl->task = task; + sl->task = current; } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); @@ -1669,7 +1669,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod { struct restart_block *restart; - hrtimer_init_sleeper(t, current); + hrtimer_init_sleeper(t); do { set_current_state(TASK_INTERRUPTIBLE); @@ -1930,7 +1930,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, hrtimer_init_on_stack(&t.timer, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - hrtimer_init_sleeper(&t, current); + hrtimer_init_sleeper(&t); hrtimer_start_expires(&t.timer, mode); -- cgit v1.2.3 From c1a280b68d4e6b6db4a65aa7865c22d8789ddf09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:37 +0200 Subject: sched/preempt: Use CONFIG_PREEMPTION where appropriate CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the preemption code, scheduler and init task over to use CONFIG_PREEMPTION. That's the first step towards RT in that area. The more complex changes are coming separately. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.117528401@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 +++++++------- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..604a5e137efe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3581,7 +3581,7 @@ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } #endif -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) /* * If the value passed in is equal to the current preempt count @@ -3782,7 +3782,7 @@ again: * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets * called on the nearest possible occasion: * - * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * - If the kernel is preemptible (CONFIG_PREEMPTION=y): * * - in syscall or exception context, at the next outmost * preempt_enable(). 
(this might be as soon as the wake_up()'s @@ -3791,7 +3791,7 @@ again: * - in IRQ context, return from interrupt-handler to * preemptible context * - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) * then at the next: * * - cond_resched() call @@ -4033,7 +4033,7 @@ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -4105,7 +4105,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* * this is the entry point to schedule() from kernel preemption @@ -5416,7 +5416,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION int __sched _cond_resched(void) { if (should_resched(0)) { @@ -5433,7 +5433,7 @@ EXPORT_SYMBOL(_cond_resched); * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc9cfeaac8bd..aff9d76d8d65 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7430,7 +7430,7 @@ static int detach_tasks(struct lb_env *env) detached++; env->imbalance -= load; -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is detached to minimize diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 802b1f3405f2..f2ce6ba1c5d5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1943,7 +1943,7 @@ unsigned long arch_scale_freq_capacity(int cpu) #endif #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -1995,7 +1995,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. -- cgit v1.2.3 From 01b1d88b09824bea1a75b0bac04dcf50d9893875 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:38 +0200 Subject: rcu: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the conditionals in RCU to use CONFIG_PREEMPTION. That's the first step towards RCU on RT. The further tweaks are work in progress. This neither touches the selftest bits which need a closer look by Paul. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.210156346@linutronix.de Signed-off-by: Ingo Molnar --- kernel/rcu/Kconfig | 8 ++++---- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree_stall.h | 6 +++--- kernel/trace/Kconfig | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 480edf328b51..7644eda17d62 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -7,7 +7,7 @@ menu "RCU Subsystem" config TREE_RCU bool - default y if !PREEMPT && SMP + default y if !PREEMPTION && SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -16,7 +16,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPT + default y if PREEMPTION help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or @@ -28,7 +28,7 @@ config PREEMPT_RCU config TINY_RCU bool - default y if !PREEMPT && !SMP + default y if !PREEMPTION && !SMP help This option selects the RCU implementation that is designed for UP systems from which real-time response @@ -70,7 +70,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - def_bool PREEMPT + def_bool PREEMPTION select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a14e5fbbea46..5962636502bc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1881,7 +1881,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2205,7 +2205,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT) || + if (!IS_ENABLED(CONFIG_PREEMPTION) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they @@ -2622,7 +2622,7 @@ static int rcu_blocking_is_gp(void) { int ret; - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 065183391f75..9b92bf18b737 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp) // // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPTION */ /* * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) { return 0; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPTION */ /* * Dump stacks of all tasks running on stalled CPUs. 
First try using diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 98da8998c25c..d2c1fe0b451d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -146,7 +146,7 @@ config FUNCTION_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select GLOB - select TASKS_RCU if PREEMPT + select TASKS_RCU if PREEMPTION help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation -- cgit v1.2.3 From 30c937043b2db09ae3408f5534824f9ececdb581 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:40 +0200 Subject: tracing: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the conditionals in the tracer over to CONFIG_PREEMPTION. This is the first step to make the tracer work on RT. The other small tweaks are submitted separately. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.409766323@linutronix.de Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 4 ++-- kernel/trace/ftrace.c | 2 +- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace_events.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d2c1fe0b451d..6a64d7772870 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -179,7 +179,7 @@ config TRACE_PREEMPT_TOGGLE config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS - select TRACE_PREEMPT_TOGGLE if PREEMPT + select TRACE_PREEMPT_TOGGLE if PREEMPTION select GENERIC_TRACER default n help @@ -214,7 +214,7 @@ config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n depends on !ARCH_USES_GETTIMEOFFSET - depends on PREEMPT + depends on PREEMPTION select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca34503f178..a800e867c1a3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2814,7 +2814,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * synchornize_rcu_tasks() will wait for those tasks to * execute and either schedule voluntarily or enter user space. */ - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) synchronize_rcu_tasks(); free_ops: diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0564f6db0561..09b0b49f346e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -267,7 +267,7 @@ static void ring_buffer_producer(void) if (consumer && !(cnt % wakeup_interval)) wake_up_process(consumer); -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION /* * If we are a non preempt kernel, the 10 second run will * stop everything while it runs. 
Instead, we will call diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c7506bc81b75..5a189fb8ec23 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -255,12 +255,12 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); /* - * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables + * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables * preemption (adding one to the preempt_count). Since we are * interested in the preempt_count at the time the tracepoint was * hit, we need to subtract one to offset the increment. */ - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) fbuffer->pc--; fbuffer->trace_file = trace_file; -- cgit v1.2.3 From 92616606368ee01f1163fcfc986116c810cd48ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:41 +0200 Subject: kprobes: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch kprobes conditional over to CONFIG_PREEMPTION. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.516286187@linutronix.de Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9873fc627d61..8bc5f1ffd68e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1906,7 +1906,7 @@ int register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); #else rp->maxactive = num_possible_cpus(); -- cgit v1.2.3 From dbc1625fc9deefb352f6ff26a575ae4b3ddef23a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:50 +0200 Subject: hrtimer: Consolidate hrtimer_init() + hrtimer_init_sleeper() calls hrtimer_init_sleeper() calls require prior initialisation of the hrtimer object which is embedded into the hrtimer_sleeper. Combine the initialization and spare a function call. Fixup all call sites. This is also a preparatory change for PREEMPT_RT to do hrtimer sleeper specific initializations of the embedded hrtimer without modifying any of the call sites. No functional change. [ anna-maria: Minor cleanups ] [ tglx: Adopted to the removal of the task argument of hrtimer_init_sleeper() and trivial polishing. Folded a fix from Stephen Rothwell for the vsoc code ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185752.887468908@linutronix.de --- kernel/futex.c | 8 +++----- kernel/time/hrtimer.c | 43 ++++++++++++++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5e9842ea4012..c8561aa5338e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -487,11 +487,9 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, if (!time) return NULL; - hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? 
- CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(timeout); - + hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index de895d86800c..bb55d62f631e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -427,6 +427,17 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode); + +void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) +{ + debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); + __hrtimer_init_sleeper(sl, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); + void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); @@ -1639,11 +1650,27 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl) +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { + __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; } + +/** + * hrtimer_init_sleeper - initialize sleeper to the given clock + * @sl: sleeper to be initialized + * @clock_id: the clock to be used + * @mode: timer mode abs/rel + */ +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_init(&sl->timer, clock_id, mode); + __hrtimer_init_sleeper(sl, clock_id, mode); + +} EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) @@ -1669,8 +1696,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod { struct restart_block *restart; - hrtimer_init_sleeper(t); - do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t->timer, mode); @@ -1707,10 +1732,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) struct hrtimer_sleeper t; int ret; - hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, - HRTIMER_MODE_ABS); + hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, + HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); - ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; @@ -1728,7 +1752,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, if (dl_task(current) || rt_task(current)) slack = 0; - hrtimer_init_on_stack(&t.timer, clockid, mode); + hrtimer_init_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) @@ -1927,11 +1951,8 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } - hrtimer_init_on_stack(&t.timer, clock_id, mode); + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - - hrtimer_init_sleeper(&t); - hrtimer_start_expires(&t.timer, mode); if (likely(t.task)) -- cgit v1.2.3 From 01656464fce946f70b02a84ab218e562ceb1662e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 
30 Jul 2019 21:03:53 +0200 Subject: hrtimer: Provide hrtimer_sleeper_start_expires() hrtimer_sleepers will gain a scheduling class dependent treatment on PREEMPT_RT. Create a wrapper around hrtimer_start_expires() to make that possible. Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bb55d62f631e..dab1ea1a99d0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1650,6 +1650,21 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) return HRTIMER_NORESTART; } +/** + * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer + * @sl: sleeper to be started + * @mode: timer mode abs/rel + * + * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers + * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) + */ +void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, + enum hrtimer_mode mode) +{ + hrtimer_start_expires(&sl->timer, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); + static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { @@ -1698,7 +1713,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod do { set_current_state(TASK_INTERRUPTIBLE); - hrtimer_start_expires(&t->timer, mode); + hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) freezable_schedule(); @@ -1953,7 +1968,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); - hrtimer_start_expires(&t.timer, mode); + hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); -- cgit v1.2.3 From 9dd8813ed9f6e2bba75434abc6c8bb06c3d87fdc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Jul 2019 21:16:55 +0200 Subject: hrtimer/treewide: Use hrtimer_sleeper_start_expires() hrtimer_sleepers will gain a scheduling class dependent treatment on PREEMPT_RT. Use the new hrtimer_sleeper_start_expires() function to make that possible. Signed-off-by: Thomas Gleixner --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c8561aa5338e..bd18f60e4c6c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2611,7 +2611,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, /* Arm the timer */ if (timeout) - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); /* * If we have been removed from the hash list, then another task @@ -2897,7 +2897,7 @@ retry_private: } if (unlikely(to)) - hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); -- cgit v1.2.3 From 0ab6a3ddbad40ef5b6b8c2353fd53fa4ecf9c479 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Jul 2019 20:15:25 +0200 Subject: hrtimer: Make enqueue mode check work on RT hrtimer_start_range_ns() has a WARN_ONCE() which verifies that a timer which is marker for softirq expiry is not queued in the hard interrupt base and vice versa. When PREEMPT_RT is enabled, timers which are not explicitely marked to expire in hard interrupt context are deferrred to the soft interrupt. 
So the regular check would trigger. Change the check, so when PREEMPT_RT is enabled, it is verified that the timers marked for hard interrupt expiry are not tried to be queued for soft interrupt expiry or any of the unmarked and softirq marked is tried to be expired in hard interrupt context. Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index dab1ea1a99d0..0ace301a56f4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1107,9 +1107,13 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft - * match. + * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard + * expiry mode because unmarked timers are moved to softirq expiry. */ - WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + else + WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); @@ -1288,6 +1292,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; + timer->is_hard = !softtimer; timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } -- cgit v1.2.3 From d5096aa65acd0ef2d18ac8247260ab4481ade399 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:52 +0200 Subject: sched: Mark hrtimers to expire in hard interrupt context The scheduler related hrtimers need to expire in hard interrupt context even on PREEMPT_RT enabled kernels. Mark then as such. No functional change. [ tglx: Split out from larger combo patch. Add changelog. 
] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.077004842@linutronix.de --- kernel/sched/core.c | 6 +++--- kernel/sched/deadline.c | 4 ++-- kernel/sched/rt.c | 7 ++++--- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..389e0993fbb4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -255,7 +255,7 @@ static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -314,7 +314,7 @@ void hrtick_start(struct rq *rq, u64 delay) */ delay = max_t(u64, delay, 10000LL); hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); } #endif /* CONFIG_SMP */ @@ -328,7 +328,7 @@ static void hrtick_rq_init(struct rq *rq) rq->hrtick_csd.info = rq; #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } #else /* CONFIG_SCHED_HRTICK */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ef5b9f6b1d42..0359612d5443 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -923,7 +923,7 @@ static int start_dl_timer(struct task_struct *p) */ if (!hrtimer_is_queued(timer)) { get_task_struct(p); - hrtimer_start(timer, act, HRTIMER_MODE_ABS); + hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); } return 1; @@ -1053,7 +1053,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); timer->function = dl_task_timer; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a532558a5176..da3e85e61013 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -45,8 +45,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); rt_b->rt_period_timer.function = sched_rt_period_timer; } @@ -67,7 +67,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) * to update the period. */ hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); - hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock(&rt_b->rt_runtime_lock); } -- cgit v1.2.3 From 30f9028b6c43fd17c006550594ea3dbb87afbf80 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:53 +0200 Subject: perf/core: Mark hrtimers to expire in hard interrupt context To guarantee that the multiplexing mechanism and the hrtimer driven events work on PREEMPT_RT enabled kernels it's required that the related hrtimers expire in hard interrupt context. Mark them so PREEMPT_RT kernels wont defer them to soft interrupt context. No functional change. [ tglx: Split out of larger combo patch. 
Added changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.169509224@linutronix.de --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 026a14541a38..9d623e257a51 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1103,7 +1103,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpuctx->hrtimer_lock); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } @@ -1121,7 +1121,7 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) if (!cpuctx->hrtimer_active) { cpuctx->hrtimer_active = 1; hrtimer_forward_now(timer, cpuctx->hrtimer_interval); - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); @@ -9491,7 +9491,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) period = max_t(u64, 10000, hwc->sample_period); } hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) @@ -9513,7 +9513,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hwc->hrtimer.function = perf_swevent_hrtimer; /* -- cgit v1.2.3 From d2ab4cf4943576fb060b8a69341d9e0c2a952ba7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:54 +0200 Subject: watchdog: Mark watchdog_hrtimer to expire in hard interrupt context The watchdog hrtimer must expire in hard interrupt context even on PREEMPT_RT=y kernels as otherwise the hard/softlockup detection logic would not work. No functional change. [ tglx: Split out from larger combo patch. Added changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.262895510@linutronix.de --- kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f9e7b9306fe..f41334ef0971 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -490,10 +490,10 @@ static void watchdog_enable(unsigned int cpu) * Start the timer first to prevent the NMI watchdog triggering * before the timer has a chance to fire. 
*/ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer->function = watchdog_timer_fn; hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); /* Initialize timestamp */ __touch_watchdog(); -- cgit v1.2.3 From 902a9f9c509053161e987778dc5836d2628f53b7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:56 +0200 Subject: tick: Mark tick related hrtimers to expiry in hard interrupt context The tick related hrtimers, which drive the scheduler tick and hrtimer based broadcasting are required to expire in hard interrupt context for obvious reasons. Mark them so PREEMPT_RT kernels wont move them to soft interrupt expiry. Make the horribly formatted RCU_NONIDLE bracket maze readable while at it. No functional change, [ tglx: Split out from larger combo patch. Add changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.459144407@linutronix.de --- kernel/time/tick-broadcast-hrtimer.c | 13 +++++++++---- kernel/time/tick-sched.c | 15 +++++++++------ 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 5be6154e2fd2..c1f5bb590b5e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -59,11 +59,16 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * hrtimer_{start/cancel} functions call into tracing, * calls to these functions must be bound within RCU_NONIDLE. */ - RCU_NONIDLE({ + RCU_NONIDLE( + { bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; - if (bc_moved) + if (bc_moved) { hrtimer_start(&bctimer, expires, - HRTIMER_MODE_ABS_PINNED);}); + HRTIMER_MODE_ABS_PINNED_HARD); + } + } + ); + if (bc_moved) { /* Bind the "device" to the cpu */ bc->bound_on = smp_processor_id(); @@ -104,7 +109,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); bctimer.function = bc_handler; clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index be9707f68024..01ff32a02af2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -634,10 +634,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, tick_period); - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - else + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start_expires(&ts->sched_timer, + HRTIMER_MODE_ABS_PINNED_HARD); + } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + } /* * Reset to make sure next tick stop doesn't get fooled by past @@ -802,7 +804,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + hrtimer_start(&ts->sched_timer, tick, + HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, tick); tick_program_event(tick, 1); @@ -1327,7 +1330,7 @@ void tick_setup_sched_timer(void) /* * Emulate tick processing via 
per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); ts->sched_timer.function = tick_sched_timer; /* Get the next period (per-CPU) */ @@ -1342,7 +1345,7 @@ void tick_setup_sched_timer(void) } hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ -- cgit v1.2.3 From f5c2f0215e36d76fbb9605283dd7535af09f5770 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:57 +0200 Subject: hrtimer: Move unmarked hrtimers to soft interrupt expiry on RT On PREEMPT_RT not all hrtimers can be expired in hard interrupt context even if that is perfectly fine on a PREEMPT_RT=n kernel, e.g. because they take regular spinlocks. Also for latency reasons PREEMPT_RT tries to defer most hrtimers' expiry into softirq context. hrtimers marked with HRTIMER_MODE_HARD must be kept in hard interrupt context expiry mode. Add the required logic. No functional change for PREEMPT_RT=n kernels. [ tglx: Split out of a larger combo patch. Added changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.551967692@linutronix.de --- kernel/time/hrtimer.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0ace301a56f4..90dcc4d95e91 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1275,8 +1275,17 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); - int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; struct hrtimer_cpu_base *cpu_base; + int base; + + /* + * On PREEMPT_RT enabled kernels hrtimers which are not explicitely + * marked for hard interrupt expiry mode are moved into soft + * interrupt context for latency reasons and because the callbacks + * can invoke functions which might sleep on RT, e.g. spin_lock(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) + softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); @@ -1290,6 +1299,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; + base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !softtimer; -- cgit v1.2.3 From 1842f5a427f5323f5c19ab99b55d09b3ab5172a5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Jul 2019 20:30:58 +0200 Subject: hrtimer: Determine hard/soft expiry mode for hrtimer sleepers on RT On PREEMPT_RT enabled kernels hrtimers which are not explicitely marked for hard interrupt expiry mode are moved into soft interrupt context either for latency reasons or because the hrtimer callback takes regular spinlocks or invokes other functions which are not suitable for hard interrupt context on PREEMPT_RT. The hrtimer_sleeper callback is RT compatible in hard interrupt context, but there is a latency concern: Untrusted userspace can spawn many threads which arm timers for the same expiry time on the same CPU. 
On expiry that causes a latency spike due to the wakeup of a gazillion threads. OTOH, priviledged real-time user space applications rely on the low latency of hard interrupt wakeups. These syscall related wakeups are all based on hrtimer sleepers. If the current task is in a real-time scheduling class, mark the mode for hard interrupt expiry. [ tglx: Split out of a larger combo patch. Added changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.645792403@linutronix.de --- kernel/time/hrtimer.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 90dcc4d95e91..c101f88ae8aa 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1676,6 +1676,16 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { + /* + * Make the enqueue delivery mode check work on RT. If the sleeper + * was initialized for hard interrupt delivery, force the mode bit. + * This is a special case for hrtimer_sleepers because + * hrtimer_init_sleeper() determines the delivery mode on RT so the + * fiddling with this decision is avoided at the call sites. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) + mode |= HRTIMER_MODE_HARD; + hrtimer_start_expires(&sl->timer, mode); } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); @@ -1683,6 +1693,30 @@ EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { + /* + * On PREEMPT_RT enabled kernels hrtimers which are not explicitely + * marked for hard interrupt expiry mode are moved into soft + * interrupt context either for latency reasons or because the + * hrtimer callback takes regular spinlocks or invokes other + * functions which are not suitable for hard interrupt context on + * PREEMPT_RT. + * + * The hrtimer_sleeper callback is RT compatible in hard interrupt + * context, but there is a latency concern: Untrusted userspace can + * spawn many threads which arm timers for the same expiry time on + * the same CPU. That causes a latency spike due to the wakeup of + * a gazillion threads. + * + * OTOH, priviledged real-time user space applications rely on the + * low latency of hard interrupt wakeups. If the current task is in + * a real-time scheduling class, mark the mode for hard interrupt + * expiry. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) + mode |= HRTIMER_MODE_HARD; + } + __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; -- cgit v1.2.3 From f61eff83cec9cfab31fd30a2ca8856be379cdcd5 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Fri, 26 Jul 2019 20:30:59 +0200 Subject: hrtimer: Prepare support for PREEMPT_RT When PREEMPT_RT is enabled, the soft interrupt thread can be preempted. If the soft interrupt thread is preempted in the middle of a timer callback, then calling hrtimer_cancel() can lead to two issues: - If the caller is on a remote CPU then it has to spin wait for the timer handler to complete. This can result in unbound priority inversion. 
- If the caller originates from the task which preempted the timer handler on the same CPU, then spin waiting for the timer handler to complete is never going to end. To avoid these issues, add a new lock to the timer base which is held around the execution of the timer callbacks. If hrtimer_cancel() detects that the timer callback is currently running, it blocks on the expiry lock. When the callback is finished, the expiry lock is dropped by the softirq thread which wakes up the waiter and the system makes progress. This addresses both the priority inversion and the life lock issues. The same issue can happen in virtual machines when the vCPU which runs a timer callback is scheduled out. If a second vCPU of the same guest calls hrtimer_cancel() it will spin wait for the other vCPU to be scheduled back in. The expiry lock mechanism would avoid that. It'd be trivial to enable this when paravirt spinlocks are enabled in a guest, but it's not clear whether this is an actual problem in the wild, so for now it's an RT only mechanism. [ tglx: Refactored it for mainline ] Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.737767218@linutronix.de --- kernel/time/hrtimer.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c101f88ae8aa..499122752649 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1162,6 +1162,82 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); +#ifdef CONFIG_PREEMPT_RT +static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) +{ + spin_lock_init(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) +{ + spin_lock(&base->softirq_expiry_lock); +} + +static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) +{ + spin_unlock(&base->softirq_expiry_lock); +} + +/* + * The counterpart to hrtimer_cancel_wait_running(). + * + * If there is a waiter for cpu_base->expiry_lock, then it was waiting for + * the timer callback to finish. Drop expiry_lock and reaquire it. That + * allows the waiter to acquire the lock and make progress. + */ +static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, + unsigned long flags) +{ + if (atomic_read(&cpu_base->timer_waiters)) { + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + spin_unlock(&cpu_base->softirq_expiry_lock); + spin_lock(&cpu_base->softirq_expiry_lock); + raw_spin_lock_irq(&cpu_base->lock); + } +} + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion, if the softirq thread on a remote CPU + * got preempted, and it prevents a life lock when the task which tries to + * delete a timer preempted the softirq thread running the timer callback + * function. + */ +void hrtimer_cancel_wait_running(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (!timer->is_soft || !base || !base->cpu_base) { + cpu_relax(); + return; + } + + /* + * Mark the base as contended and grab the expiry lock, which is + * held by the softirq across the timer callback. Drop the lock + * immediately so the softirq can expire the next timer. 
In theory + * the timer could already be running again, but that's more than + * unlikely and just causes another wait loop. + */ + atomic_inc(&base->cpu_base->timer_waiters); + spin_lock_bh(&base->cpu_base->softirq_expiry_lock); + atomic_dec(&base->cpu_base->timer_waiters); + spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); +} +#else +static inline void +hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, + unsigned long flags) { } +#endif + /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. * @timer: the timer to be cancelled @@ -1172,13 +1248,15 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); */ int hrtimer_cancel(struct hrtimer *timer) { - for (;;) { - int ret = hrtimer_try_to_cancel(timer); + int ret; - if (ret >= 0) - return ret; - cpu_relax(); - } + do { + ret = hrtimer_try_to_cancel(timer); + + if (ret < 0) + hrtimer_cancel_wait_running(timer); + } while (ret < 0); + return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1475,6 +1553,8 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); + if (active_mask == HRTIMER_ACTIVE_SOFT) + hrtimer_sync_wait_running(cpu_base, flags); } } } @@ -1485,6 +1565,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) unsigned long flags; ktime_t now; + hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); @@ -1494,6 +1575,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1897,6 +1979,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; + hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } -- cgit v1.2.3 From 030dcdd197d77374879bb5603d091eee7d8aba80 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Fri, 26 Jul 2019 20:31:00 +0200 Subject: timers: Prepare support for PREEMPT_RT When PREEMPT_RT is enabled, the soft interrupt thread can be preempted. If the soft interrupt thread is preempted in the middle of a timer callback, then calling del_timer_sync() can lead to two issues: - If the caller is on a remote CPU then it has to spin wait for the timer handler to complete. This can result in unbound priority inversion. - If the caller originates from the task which preempted the timer handler on the same CPU, then spin waiting for the timer handler to complete is never going to end. To avoid these issues, add a new lock to the timer base which is held around the execution of the timer callbacks. If del_timer_sync() detects that the timer callback is currently running, it blocks on the expiry lock. When the callback is finished, the expiry lock is dropped by the softirq thread which wakes up the waiter and the system makes progress. This addresses both the priority inversion and the life lock issues. This mechanism is not used for timers which are marked IRQSAFE as for those preemption is disabled accross the callback and therefore this situation cannot happen. 
The callbacks for such timers need to be individually audited for RT compliance. The same issue can happen in virtual machines when the vCPU which runs a timer callback is scheduled out. If a second vCPU of the same guest calls del_timer_sync() it will spin wait for the other vCPU to be scheduled back in. The expiry lock mechanism would avoid that. It'd be trivial to enable this when paravirt spinlocks are enabled in a guest, but it's not clear whether this is an actual problem in the wild, so for now it's an RT only mechanism. As the softirq thread can be preempted with PREEMPT_RT=y, the SMP variant of del_timer_sync() needs to be used on UP as well. [ tglx: Refactored it for mainline ] Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726185753.832418500@linutronix.de --- kernel/time/timer.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 95 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 343c7ba33b1c..673c6a0f0c45 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -196,6 +196,10 @@ EXPORT_SYMBOL(jiffies_64); struct timer_base { raw_spinlock_t lock; struct timer_list *running_timer; +#ifdef CONFIG_PREEMPT_RT + spinlock_t expiry_lock; + atomic_t timer_waiters; +#endif unsigned long clk; unsigned long next_expiry; unsigned int cpu; @@ -1227,7 +1231,78 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); -#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT_RT +static __init void timer_base_init_expiry_lock(struct timer_base *base) +{ + spin_lock_init(&base->expiry_lock); +} + +static inline void timer_base_lock_expiry(struct timer_base *base) +{ + spin_lock(&base->expiry_lock); +} + +static inline void timer_base_unlock_expiry(struct timer_base *base) +{ + spin_unlock(&base->expiry_lock); +} + +/* + * The counterpart to del_timer_wait_running(). + * + * If there is a waiter for base->expiry_lock, then it was waiting for the + * timer callback to finish. Drop expiry_lock and reaquire it. That allows + * the waiter to acquire the lock and make progress. + */ +static void timer_sync_wait_running(struct timer_base *base) +{ + if (atomic_read(&base->timer_waiters)) { + spin_unlock(&base->expiry_lock); + spin_lock(&base->expiry_lock); + } +} + +/* + * This function is called on PREEMPT_RT kernels when the fast path + * deletion of a timer failed because the timer callback function was + * running. + * + * This prevents priority inversion, if the softirq thread on a remote CPU + * got preempted, and it prevents a life lock when the task which tries to + * delete a timer preempted the softirq thread running the timer callback + * function. + */ +static void del_timer_wait_running(struct timer_list *timer) +{ + u32 tf; + + tf = READ_ONCE(timer->flags); + if (!(tf & TIMER_MIGRATING)) { + struct timer_base *base = get_timer_base(tf); + + /* + * Mark the base as contended and grab the expiry lock, + * which is held by the softirq across the timer + * callback. Drop the lock immediately so the softirq can + * expire the next timer. In theory the timer could already + * be running again, but that's more than unlikely and just + * causes another wait loop. 
+ */ + atomic_inc(&base->timer_waiters); + spin_lock_bh(&base->expiry_lock); + atomic_dec(&base->timer_waiters); + spin_unlock_bh(&base->expiry_lock); + } +} +#else +static inline void timer_base_init_expiry_lock(struct timer_base *base) { } +static inline void timer_base_lock_expiry(struct timer_base *base) { } +static inline void timer_base_unlock_expiry(struct timer_base *base) { } +static inline void timer_sync_wait_running(struct timer_base *base) { } +static inline void del_timer_wait_running(struct timer_list *timer) { } +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1266,6 +1341,8 @@ EXPORT_SYMBOL(try_to_del_timer_sync); */ int del_timer_sync(struct timer_list *timer) { + int ret; + #ifdef CONFIG_LOCKDEP unsigned long flags; @@ -1283,12 +1360,17 @@ int del_timer_sync(struct timer_list *timer) * could lead to deadlock. */ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); - for (;;) { - int ret = try_to_del_timer_sync(timer); - if (ret >= 0) - return ret; - cpu_relax(); - } + + do { + ret = try_to_del_timer_sync(timer); + + if (unlikely(ret < 0)) { + del_timer_wait_running(timer); + cpu_relax(); + } + } while (ret < 0); + + return ret; } EXPORT_SYMBOL(del_timer_sync); #endif @@ -1360,10 +1442,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, baseclk); + base->running_timer = NULL; raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, baseclk); + base->running_timer = NULL; + timer_sync_wait_running(base); raw_spin_lock_irq(&base->lock); } } @@ -1658,6 +1743,7 @@ static inline void __run_timers(struct timer_base *base) if (!time_after_eq(jiffies, base->clk)) return; + timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); /* @@ -1684,8 +1770,8 @@ static inline void __run_timers(struct timer_base *base) while (levels--) expire_timers(base, heads + levels); } - base->running_timer = NULL; raw_spin_unlock_irq(&base->lock); + timer_base_unlock_expiry(base); } /* @@ -1930,6 +2016,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; + timer_base_init_expiry_lock(base); } } -- cgit v1.2.3 From 850377a875a481c393ce59111b0c9725005e0eb4 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 31 Jul 2019 12:37:15 +0200 Subject: sched/deadline: Ensure inactive_timer runs in hardirq context SCHED_DEADLINE inactive timer needs to run in hardirq context (as dl_task_timer already does) on PREEMPT_RT Change the mode to HRTIMER_MODE_REL_HARD. 
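For illustration only (the names below are hypothetical, not part of this patch): the hard-expiry convention pairs a *_HARD mode at hrtimer_init() time with a matching *_HARD mode at start time, which is what the mode debugging mentioned below checks for.

	static struct hrtimer example_timer;	/* hypothetical timer */

	static enum hrtimer_restart example_fn(struct hrtimer *t)
	{
		/* Expires in hard interrupt context, even on PREEMPT_RT */
		return HRTIMER_NORESTART;
	}

	static void example_arm(void)
	{
		hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
		example_timer.function = example_fn;
		/* The start mode must agree with the init mode (hard vs. soft) */
		hrtimer_start(&example_timer, ms_to_ktime(10), HRTIMER_MODE_REL_HARD);
	}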
[ tglx: Fixed up the start site, so mode debugging works ] Signed-off-by: Juri Lelli Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190731103715.4047-1-juri.lelli@redhat.com --- kernel/sched/deadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0359612d5443..83a663a34196 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -287,7 +287,7 @@ static void task_non_contending(struct task_struct *p) dl_se->dl_non_contending = 1; get_task_struct(p); - hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); } static void task_contending(struct sched_dl_entity *dl_se, int flags) @@ -1292,7 +1292,7 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); timer->function = inactive_task_timer; } -- cgit v1.2.3 From 51ae33092bb8320497ec75ddc5ab383d8fafd55c Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Wed, 31 Jul 2019 00:33:49 +0200 Subject: alarmtimer: Prepare for PREEMPT_RT Use the hrtimer_cancel_wait_running() synchronization mechanism to prevent priority inversion and live locks on PREEMPT_RT. [ tglx: Split out of combo patch ] Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.508744705@linutronix.de --- kernel/time/alarmtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 57518efc3810..36947449dba2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -432,7 +432,7 @@ int alarm_cancel(struct alarm *alarm) int ret = alarm_try_to_cancel(alarm); if (ret >= 0) return ret; - cpu_relax(); + hrtimer_cancel_wait_running(&alarm->timer); } } EXPORT_SYMBOL_GPL(alarm_cancel); -- cgit v1.2.3 From c7e6d704a0097e59667495cf52dcc4e1085e620b Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Wed, 31 Jul 2019 00:33:51 +0200 Subject: itimers: Prepare for PREEMPT_RT Use the hrtimer_cancel_wait_running() synchronization mechanism to prevent priority inversion and live locks on PREEMPT_RT. As a benefit the retry loop gains the missing cpu_relax() on !RT. 
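A rough sketch of the resulting cancel pattern (placeholder names, not additional patch content): any lock that the timer callback also takes must be dropped before waiting, otherwise the running callback could never finish and the wait would deadlock.

	struct example_obj {			/* hypothetical container */
		spinlock_t	lock;		/* also taken by the timer callback */
		struct hrtimer	timer;
	};

	static void example_cancel(struct example_obj *obj)
	{
	again:
		spin_lock_irq(&obj->lock);
		if (hrtimer_try_to_cancel(&obj->timer) < 0) {
			spin_unlock_irq(&obj->lock);
			/* On RT this blocks on the expiry lock instead of spinning */
			hrtimer_cancel_wait_running(&obj->timer);
			goto again;
		}
		/* Timer is neither queued nor running here; safe to modify it */
		spin_unlock_irq(&obj->lock);
	}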
[ tglx: Split out of combo patch ] Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.690771827@linutronix.de --- kernel/time/itimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 02068b2d5862..9d26fd4ba4c0 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -213,6 +213,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_cancel_wait_running(timer); goto again; } expires = timeval_to_ktime(value->it_value); -- cgit v1.2.3 From 21670ee44f1e3565030bcabc62178b8e5eb2fce7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Jul 2019 00:33:52 +0200 Subject: posix-timers: Cleanup the flag/flags confusion do_timer_settime() has a 'flags' argument and uses 'flag' for the interrupt flags, which is confusing at best. Rename the argument so 'flags' can be used for interrupt flags as usual. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.782664411@linutronix.de --- kernel/time/posix-timers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index d7f2d91acdac..f5aedd2f60df 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -844,13 +844,13 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -static int do_timer_settime(timer_t timer_id, int flags, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { const struct k_clock *kc; struct k_itimer *timr; - unsigned long flag; + unsigned long flags; int error = 0; if (!timespec64_valid(&new_spec64->it_interval) || @@ -860,7 +860,7 @@ static int do_timer_settime(timer_t timer_id, int flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); retry: - timr = lock_timer(timer_id, &flag); + timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; @@ -868,9 +868,9 @@ retry: if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else - error = kc->timer_set(timr, flags, new_spec64, old_spec64); + error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - unlock_timer(timr, flag); + unlock_timer(timr, flags); if (error == TIMER_RETRY) { old_spec64 = NULL; // We already got the old time... goto retry; -- cgit v1.2.3 From 6945e5c2abe008302b20266248d6de95575311a8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Jul 2019 00:33:53 +0200 Subject: posix-timers: Rework cancel retry loops As a preparatory step for adding the PREEMPT RT specific synchronization mechanism to wait for a running timer callback, rework the timer cancel retry loops so they call a common function. This allows trivial substitution in one place. 
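One caller-side detail worth making explicit (an illustrative sketch, not additional patch content): the helper drops the k_itimer lock while waiting, so the timer may be deleted concurrently and the relock by id can fail. The retry loops therefore have to recheck for NULL, roughly as below; do_one_attempt() stands in for the timer_set/timer_delete hook.

	static int example_retry_loop(timer_t timer_id)
	{
		struct k_itimer *timr;
		unsigned long flags;
		int error;

		timr = lock_timer(timer_id, &flags);
	retry:
		if (!timr)
			return -EINVAL;		/* timer vanished while we waited */
		error = do_one_attempt(timr);	/* placeholder for the hook call */
		if (error == TIMER_RETRY) {
			/* Unlocks, waits for the running callback, relocks by id */
			timr = timer_wait_running(timr, &flags);
			goto retry;
		}
		unlock_timer(timr, flags);
		return error;
	}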
Originally-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.874901027@linutronix.de --- kernel/time/posix-timers.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index f5aedd2f60df..bbe8f9686a70 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -805,6 +805,17 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr) return hrtimer_try_to_cancel(&timr->it.real.timer); } +static struct k_itimer *timer_wait_running(struct k_itimer *timer, + unsigned long *flags) +{ + timer_t timer_id = READ_ONCE(timer->it_id); + + unlock_timer(timer, *flags); + cpu_relax(); + /* Relock the timer. It might be not longer hashed. */ + return lock_timer(timer_id, flags); +} + /* Set a POSIX.1b interval timer. */ int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, @@ -859,8 +870,9 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); -retry: + timr = lock_timer(timer_id, &flags); +retry: if (!timr) return -EINVAL; @@ -870,11 +882,14 @@ retry: else error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - unlock_timer(timr, flags); if (error == TIMER_RETRY) { - old_spec64 = NULL; // We already got the old time... + // We already got the old time... + old_spec64 = NULL; + /* Unlocks and relocks the timer if it still exists */ + timr = timer_wait_running(timr, &flags); goto retry; } + unlock_timer(timr, flags); return error; } @@ -951,13 +966,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) struct k_itimer *timer; unsigned long flags; -retry_delete: timer = lock_timer(timer_id, &flags); + +retry_delete: if (!timer) return -EINVAL; - if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); + if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { + /* Unlocks and relocks the timer if it still exists */ + timer = timer_wait_running(timer, &flags); goto retry_delete; } -- cgit v1.2.3 From 5d99b32a009e900a561f6a42ea7afe5b21288b8a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 31 Jul 2019 00:33:54 +0200 Subject: posix-timers: Move rcu_head out of it union Timer deletion on PREEMPT_RT is prone to priority inversion and live locks. The hrtimer code has a synchronization mechanism for this. Posix CPU timers will grow one. But that mechanism cannot be invoked while holding the k_itimer lock because that can deadlock against the running timer callback. So the lock must be dropped which allows the timer to be freed. The timer free can be prevented by taking RCU readlock before dropping the lock, but because the rcu_head is part of the 'it' union a concurrent free will overwrite the hrtimer on which the task is trying to synchronize. Move the rcu_head out of the union to prevent this. [ tglx: Fixed up kernel-doc. 
Rewrote changelog ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190730223828.965541887@linutronix.de --- kernel/time/posix-timers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index bbe8f9686a70..3e663f982c82 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -442,7 +442,7 @@ static struct k_itimer * alloc_posix_timer(void) static void k_itimer_rcu_free(struct rcu_head *head) { - struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); kmem_cache_free(posix_timers_cache, tmr); } @@ -459,7 +459,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); - call_rcu(&tmr->it.rcu, k_itimer_rcu_free); + call_rcu(&tmr->rcu, k_itimer_rcu_free); } static int common_timer_create(struct k_itimer *new_timer) -- cgit v1.2.3 From 3695eae5fee0605f316fbaad0b9e3de791d7dfaf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 28 Jul 2019 00:22:29 +0200 Subject: pidfd: add P_PIDFD to waitid() This adds the P_PIDFD type to waitid(). One of the last remaining bits for the pidfd api is to make it possible to wait on pidfds. With P_PIDFD added to waitid() the parts of userspace that want to use the pidfd api to exclusively manage processes can do so now. One of the things this will unblock in the future is the ability to make it possible to retrieve the exit status via waitid(P_PIDFD) for non-parent processes if handed a _suitable_ pidfd that has this feature set. This is similar to what you can do on FreeBSD with kqueue(). It might even end up being possible to wait on a process as a non-parent if an appropriate property is enabled on the pidfd. With P_PIDFD no scoping of the process identified by the pidfd is possible, i.e. it explicitly blocks things such as wait4(-1), wait4(0), waitid(P_ALL), waitid(P_PGID) etc. It only allows for semantics equivalent to wait4(pid), waitid(P_PID). Users that need scoping should rely on pid-based wait*() syscalls for now. Signed-off-by: Christian Brauner Reviewed-by: Kees Cook Reviewed-by: Oleg Nesterov Cc: Arnd Bergmann Cc: "Eric W. 
Biederman" Cc: Joel Fernandes (Google) Cc: Thomas Gleixner Cc: David Howells Cc: Jann Horn Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro Link: https://lore.kernel.org/r/20190727222229.6516-2-christian@brauner.io --- kernel/exit.c | 33 ++++++++++++++++++++++++++++++--- kernel/fork.c | 8 ++++++++ kernel/signal.c | 7 +++++-- 3 files changed, 43 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a75b6a7f458a..64bb6893a37d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1552,6 +1552,23 @@ end: return retval; } +static struct pid *pidfd_get_pid(unsigned int fd) +{ + struct fd f; + struct pid *pid; + + f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + + pid = pidfd_pid(f.file); + if (!IS_ERR(pid)) + get_pid(pid); + + fdput(f); + return pid; +} + static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { @@ -1574,19 +1591,29 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; + + pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; if (upid <= 0) return -EINVAL; + + pid = find_get_pid(upid); + break; + case P_PIDFD: + type = PIDTYPE_PID; + if (upid < 0) + return -EINVAL; + + pid = pidfd_get_pid(upid); + if (IS_ERR(pid)) + return PTR_ERR(pid); break; default: return -EINVAL; } - if (type < PIDTYPE_MAX) - pid = find_get_pid(upid); - wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options; diff --git a/kernel/fork.c b/kernel/fork.c index d8ae0f1b4148..b169e2ca2d84 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1690,6 +1690,14 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ } +struct pid *pidfd_pid(const struct file *file) +{ + if (file->f_op == &pidfd_fops) + return file->private_data; + + return ERR_PTR(-EBADF); +} + static int pidfd_release(struct inode *inode, struct file *file) { struct pid *pid = file->private_data; diff --git a/kernel/signal.c b/kernel/signal.c index 91b789dd6e72..2e567f64812f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3672,8 +3672,11 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) static struct pid *pidfd_to_pid(const struct file *file) { - if (file->f_op == &pidfd_fops) - return file->private_data; + struct pid *pid; + + pid = pidfd_pid(file); + if (!IS_ERR(pid)) + return pid; return tgid_pidfd_to_pid(file); } -- cgit v1.2.3 From 9babe825da769cfbd7080f95e6bddbe98ce6b95d Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 29 Jul 2019 14:51:10 -0700 Subject: bpf: always allocate at least 16 bytes for setsockopt hook Since we always allocate memory, allocate just a little bit more for the BPF program in case it need to override user input with bigger value. The canonical example is TCP_CONGESTION where input string might be too small to override (nv -> bbr or cubic). 16 bytes are chosen to match the size of TCP_CA_NAME_MAX and can be extended in the future if needed. 
Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0a00eaca6fae..6a6a154cfa7b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -964,7 +964,6 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return -ENOMEM; ctx->optval_end = ctx->optval + max_optlen; - ctx->optlen = max_optlen; return 0; } @@ -984,7 +983,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, .level = *level, .optname = *optname, }; - int ret; + int ret, max_optlen; /* Opportunistic check to see whether we have any BPF program * attached to the hook so we don't waste time allocating @@ -994,10 +993,18 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; - ret = sockopt_alloc_buf(&ctx, *optlen); + /* Allocate a bit more than the initial user buffer for + * BPF program. The canonical use case is overriding + * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). + */ + max_optlen = max_t(int, 16, *optlen); + + ret = sockopt_alloc_buf(&ctx, max_optlen); if (ret) return ret; + ctx.optlen = *optlen; + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { ret = -EFAULT; goto out; @@ -1016,7 +1023,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */ ret = 1; - } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { /* optlen is out of bounds */ ret = -EFAULT; } else { @@ -1063,6 +1070,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, if (ret) return ret; + ctx.optlen = max_optlen; + if (!retval) { /* If kernel getsockopt finished successfully, * copy whatever was returned to the user back -- cgit v1.2.3 From d143b3d1cd89f6bcab67dc88160914aa3536c663 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Jun 2019 12:05:54 -0700 Subject: rcu: Simplify rcu_read_unlock_special() deferred wakeups In !use_softirq runs, we clearly cannot rely on raise_softirq() and its lightweight bit setting, so we must instead do some form of wakeup. In the absence of a self-IPI when interrupts are disabled, these wakeups can be delayed until the next interrupt occurs. This means that calling invoke_rcu_core() doesn't actually do any expediting. In this case, it is better to take the "else" clause, which sets the current CPU's resched bits and, if there is an expedited grace period in flight, uses IRQ-work to force the needed self-IPI. This commit therefore removes the "else if" clause that calls invoke_rcu_core(). Reported-by: Scott Wood Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index acb225023ed1..3f0701e860e4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -631,17 +631,12 @@ static void rcu_read_unlock_special(struct task_struct *t) // Using softirq, safe to awaken, and we get // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); - } else if (exp && irqs_were_disabled && !use_softirq && - !t->rcu_read_unlock_special.b.deferred_qs) { - // Safe to awaken and we get no help from enabling - // irqs, unlike bh/preempt. 
- invoke_rcu_core(); } else { // Enabling BH or preempt does reschedule, so... // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); - if (IS_ENABLED(CONFIG_IRQ_WORK) && + if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && !rdp->defer_qs_iw_pending && exp) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. -- cgit v1.2.3 From 87446b48748b49dd34900904649a5ec95a591699 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Jun 2019 11:25:26 -0700 Subject: rcu: Make rcu_read_unlock_special() checks match raise_softirq_irqoff() Threaded interrupts provide additional interesting interactions between RCU and raise_softirq() that can result in self-deadlocks in v5.0-2 of the Linux kernel. These self-deadlocks can be provoked in susceptible kernels within a few minutes using the following rcutorture command on an 8-CPU system: tools/testing/selftests/rcutorture/bin/kvm.sh --duration 5 --configs "TREE03" --bootargs "threadirqs" Although post-v5.2 RCU commits have at least greatly reduced the probability of these self-deadlocks, this was entirely by accident. Although this sort of accident should be rowdily celebrated on those rare occasions when it does occur, such celebrations should be quickly followed by a principled patch, which is what this patch purports to be. The key point behind this patch is that when in_interrupt() returns true, __raise_softirq_irqoff() will never attempt a wakeup. Therefore, if in_interrupt(), calls to raise_softirq*() are both safe and extremely cheap. This commit therefore replaces the in_irq() calls in the "if" statement in rcu_read_unlock_special() with in_interrupt() and simplifies the "if" condition to the following: if (irqs_were_disabled && use_softirq && (in_interrupt() || (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { raise_softirq_irqoff(RCU_SOFTIRQ); } else { /* Appeal to the scheduler. */ } The rationale behind the "if" condition is as follows: 1. irqs_were_disabled: If interrupts are enabled, we should instead appeal to the scheduler so as to let the upcoming irq_enable()/local_bh_enable() do the rescheduling for us. 2. use_softirq: If this kernel isn't using softirq, then raise_softirq_irqoff() will be unhelpful. 3. a. in_interrupt(): If this returns true, the subsequent call to raise_softirq_irqoff() is guaranteed not to do a wakeup, so that call will be both very cheap and quite safe. b. Otherwise, if !in_interrupt() the raise_softirq_irqoff() might do a wakeup, which is expensive and, in some contexts, unsafe. i. The "exp" (an expedited RCU grace period is being blocked) says that the wakeup is worthwhile, and: ii. The !.deferred_qs says that scheduler locks cannot be held, so the wakeup will be safe. Backporting this requires considerable care, so no auto-backport, please! Fixes: 05f415715ce45 ("rcu: Speed up expedited GPs when interrupting RCU reader") Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3f0701e860e4..1fd3ca4ffc1d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -626,8 +626,9 @@ static void rcu_read_unlock_special(struct task_struct *t) (rdp->grpmask & rnp->expmask) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. 
- if ((exp || in_irq()) && irqs_were_disabled && use_softirq && - (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { + if (irqs_were_disabled && use_softirq && + (in_interrupt() || + (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { // Using softirq, safe to awaken, and we get // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); -- cgit v1.2.3 From cb4dbbfaa1f5a190f041b174177699a009ab2ecd Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 1 Jul 2019 00:04:14 -0400 Subject: rcu: Simplify rcu_note_context_switch exit from critical section Because __rcu_read_unlock() can be preempted just before the call to rcu_read_unlock_special(), it is possible for a task to be preempted just before it would have fully exited its RCU read-side critical section. This would result in a needless extension of that critical section until that task was resumed, which might in turn result in a needlessly long grace period, needless RCU priority boosting, and needless force-quiescent-state actions. Therefore, rcu_note_context_switch() invokes __rcu_read_unlock() followed by rcu_preempt_deferred_qs() when it detects this situation. This action by rcu_note_context_switch() ends the RCU read-side critical section immediately. Of course, once the task resumes, it will invoke rcu_read_unlock_special() redundantly. This is harmless because the fact that a preemption happened means that interrupts, preemption, and softirqs cannot have been disabled, so there would be no deferred quiescent state. While ->rcu_read_lock_nesting remains less than zero, none of the ->rcu_read_unlock_special.b bits can be set, and they were all zeroed by the call to rcu_note_context_switch() at task-preemption time. Therefore, setting ->rcu_read_unlock_special.b.exp_hint to false has no effect. Therefore, the extra call to rcu_preempt_deferred_qs_irqrestore() would return immediately. With one possible exception, which is if an expedited grace period started just as the task was being resumed, which could leave ->exp_deferred_qs set. This will cause rcu_preempt_deferred_qs_irqrestore() to invoke rcu_report_exp_rdp(), reporting the quiescent state, just as it should. (Such an expedited grace period won't affect the preemption code path due to interrupts having already been disabled.) But when rcu_note_context_switch() invokes __rcu_read_unlock(), it is doing so with preemption disabled, hence __rcu_read_unlock() will unconditionally defer the quiescent state, only to immediately invoke rcu_preempt_deferred_qs(), thus immediately reporting the deferred quiescent state. It turns out to be safe (and faster) to instead just invoke rcu_preempt_deferred_qs() without the __rcu_read_unlock() middleman. Because this is the invocation during the preemption (as opposed to the invocation just after the resume), at least one of the bits in ->rcu_read_unlock_special.b must be set and ->rcu_read_lock_nesting must be negative. This means that rcu_preempt_need_deferred_qs() must return true, avoiding the early exit from rcu_preempt_deferred_qs(). Thus, rcu_preempt_deferred_qs_irqrestore() will be invoked immediately, as required. This commit therefore simplifies the CONFIG_PREEMPT=y version of rcu_note_context_switch() by removing the "else if" branch of its "if" statement. 
This change means that all callers that would have invoked rcu_read_unlock_special() followed by rcu_preempt_deferred_qs() will now simply invoke rcu_preempt_deferred_qs(), thus avoiding the rcu_read_unlock_special() middleman when __rcu_read_unlock() is preempted. Cc: rcu@vger.kernel.org Cc: kernel-team@android.com Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1fd3ca4ffc1d..ce6ef345102b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -314,15 +314,6 @@ void rcu_note_context_switch(bool preempt) ? rnp->gp_seq : rcu_seq_snap(&rnp->gp_seq)); rcu_preempt_ctxt_queue(rnp, rdp); - } else if (t->rcu_read_lock_nesting < 0 && - t->rcu_read_unlock_special.s) { - - /* - * Complete exit from RCU read-side critical section on - * behalf of preempted instance of __rcu_read_unlock(). - */ - rcu_read_unlock_special(t); - rcu_preempt_deferred_qs(t); } else { rcu_preempt_deferred_qs(t); } -- cgit v1.2.3 From 519248f36d6f3c80e176f6fa844c10d94f1f5990 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 May 2019 05:39:25 -0700 Subject: lockdep: Make print_lock() address visible Security is a wonderful thing, but so is the ability to debug based on lockdep warnings. This commit therefore makes lockdep lock addresses visible in the clear. Signed-off-by: Paul E. McKenney --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4861cf8e274b..4aca3f4379d2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -620,7 +620,7 @@ static void print_lock(struct held_lock *hlock) return; } - printk(KERN_CONT "%p", hlock->instance); + printk(KERN_CONT "%px", hlock->instance); print_lock_name(lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } -- cgit v1.2.3 From b55bd585551ed2220eefdab96b31e6f935310eec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 May 2019 05:39:25 -0700 Subject: time/tick-broadcast: Fix tick_broadcast_offline() lockdep complaint The TASKS03 and TREE04 rcutorture scenarios produce the following lockdep complaint: ------------------------------------------------------------------------ ================================ WARNING: inconsistent lock state 5.2.0-rc1+ #513 Not tainted -------------------------------- inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. 
migration/1/14 [HC0[0]:SC0[0]:HE1:SE1] takes: (____ptrval____) (tick_broadcast_lock){?...}, at: tick_broadcast_offline+0xf/0x70 {IN-HARDIRQ-W} state was registered at: lock_acquire+0xb0/0x1c0 _raw_spin_lock_irqsave+0x3c/0x50 tick_broadcast_switch_to_oneshot+0xd/0x40 tick_switch_to_oneshot+0x4f/0xd0 hrtimer_run_queues+0xf3/0x130 run_local_timers+0x1c/0x50 update_process_times+0x1c/0x50 tick_periodic+0x26/0xc0 tick_handle_periodic+0x1a/0x60 smp_apic_timer_interrupt+0x80/0x2a0 apic_timer_interrupt+0xf/0x20 _raw_spin_unlock_irqrestore+0x4e/0x60 rcu_nocb_gp_kthread+0x15d/0x590 kthread+0xf3/0x130 ret_from_fork+0x3a/0x50 irq event stamp: 171 hardirqs last enabled at (171): [] trace_hardirqs_on_thunk+0x1a/0x1c hardirqs last disabled at (170): [] trace_hardirqs_off_thunk+0x1a/0x1c softirqs last enabled at (0): [] copy_process.part.56+0x650/0x1cb0 softirqs last disabled at (0): [<0000000000000000>] 0x0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(tick_broadcast_lock); lock(tick_broadcast_lock); *** DEADLOCK *** 1 lock held by migration/1/14: #0: (____ptrval____) (clockevents_lock){+.+.}, at: tick_offline_cpu+0xf/0x30 stack backtrace: CPU: 1 PID: 14 Comm: migration/1 Not tainted 5.2.0-rc1+ #513 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x5e/0x8b print_usage_bug+0x1fc/0x216 ? print_shortest_lock_dependencies+0x1b0/0x1b0 mark_lock+0x1f2/0x280 __lock_acquire+0x1e0/0x18f0 ? __lock_acquire+0x21b/0x18f0 ? _raw_spin_unlock_irqrestore+0x4e/0x60 lock_acquire+0xb0/0x1c0 ? tick_broadcast_offline+0xf/0x70 _raw_spin_lock+0x33/0x40 ? tick_broadcast_offline+0xf/0x70 tick_broadcast_offline+0xf/0x70 tick_offline_cpu+0x16/0x30 take_cpu_down+0x7d/0xa0 multi_cpu_stop+0xa2/0xe0 ? cpu_stop_queue_work+0xc0/0xc0 cpu_stopper_thread+0x6d/0x100 smpboot_thread_fn+0x169/0x240 kthread+0xf3/0x130 ? sort_range+0x20/0x20 ? kthread_cancel_delayed_work_sync+0x10/0x10 ret_from_fork+0x3a/0x50 ------------------------------------------------------------------------ To reproduce, run the following rcutorture test: tools/testing/selftests/rcutorture/bin/kvm.sh --duration 5 --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" --configs "TASKS03 TREE04" It turns out that tick_broadcast_offline() was an innocent bystander. After all, interrupts are supposed to be disabled throughout take_cpu_down(), and therefore should have been disabled upon entry to tick_offline_cpu() and thus to tick_broadcast_offline(). This suggests that one of the CPU-hotplug notifiers was incorrectly enabling interrupts, and leaving them enabled on return. Some debugging code showed that the culprit was sched_cpu_dying(). It had irqs enabled after return from sched_tick_stop(). Which in turn had irqs enabled after return from cancel_delayed_work_sync(). Which is a wrapper around __cancel_work_timer(). Which can sleep in the case where something else is concurrently trying to cancel the same delayed work, and as Thomas Gleixner pointed out on IRC, sleeping is a decidedly bad idea when you are invoked from take_cpu_down(), regardless of the state you leave interrupts in upon return. Code inspection located no reason why the delayed work absolutely needed to be canceled from sched_tick_stop(): The work is not bound to the outgoing CPU by design, given that the whole point is to collect statistics without disturbing the outgoing CPU. This commit therefore simply drops the cancel_delayed_work_sync() from sched_tick_stop(). 
Instead, a new ->state field is added to the tick_work structure so that the delayed-work handler function sched_tick_remote() can avoid reposting itself. A cpu_is_offline() check is also added to sched_tick_remote() to avoid mucking with the state of an offlined CPU (though it does appear safe to do so). The sched_tick_start() and sched_tick_stop() functions also update ->state, and sched_tick_start() also schedules the delayed work if ->state indicates that it is not already in flight. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Reviewed-by: Frederic Weisbecker [ paulmck: Apply Peter Zijlstra and Frederic Weisbecker atomics feedback. ] Acked-by: Peter Zijlstra (Intel) --- kernel/sched/core.c | 57 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..0b22e55cebe8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3486,8 +3486,36 @@ void scheduler_tick(void) struct tick_work { int cpu; + atomic_t state; struct delayed_work work; }; +/* Values for ->state, see diagram below. */ +#define TICK_SCHED_REMOTE_OFFLINE 0 +#define TICK_SCHED_REMOTE_OFFLINING 1 +#define TICK_SCHED_REMOTE_RUNNING 2 + +/* + * State diagram for ->state: + * + * + * TICK_SCHED_REMOTE_OFFLINE + * | ^ + * | | + * | | sched_tick_remote() + * | | + * | | + * +--TICK_SCHED_REMOTE_OFFLINING + * | ^ + * | | + * sched_tick_start() | | sched_tick_stop() + * | | + * V | + * TICK_SCHED_REMOTE_RUNNING + * + * + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() + * and sched_tick_start() are happy to leave the state in RUNNING. + */ static struct tick_work __percpu *tick_work_cpu; @@ -3500,6 +3528,7 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr; struct rq_flags rf; u64 delta; + int os; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3513,7 +3542,7 @@ static void sched_tick_remote(struct work_struct *work) rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr)) + if (is_idle_task(curr) || cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); @@ -3533,13 +3562,18 @@ out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough - * to keep scheduler internal stats reasonably up to date. + * to keep scheduler internal stats reasonably up to date. But + * first update state to reflect hotplug activity if required. 
*/ - queue_delayed_work(system_unbound_wq, dwork, HZ); + os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); + if (os == TICK_SCHED_REMOTE_RUNNING) + queue_delayed_work(system_unbound_wq, dwork, HZ); } static void sched_tick_start(int cpu) { + int os; struct tick_work *twork; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) @@ -3548,15 +3582,20 @@ static void sched_tick_start(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - twork->cpu = cpu; - INIT_DELAYED_WORK(&twork->work, sched_tick_remote); - queue_delayed_work(system_unbound_wq, &twork->work, HZ); + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); + if (os == TICK_SCHED_REMOTE_OFFLINE) { + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); + } } #ifdef CONFIG_HOTPLUG_CPU static void sched_tick_stop(int cpu) { struct tick_work *twork; + int os; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) return; @@ -3564,7 +3603,10 @@ static void sched_tick_stop(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - cancel_delayed_work_sync(&twork->work); + /* There cannot be competing actions, but don't rely on stop-machine. */ + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); + WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); + /* Don't cancel, as this would mess up the state machine. */ } #endif /* CONFIG_HOTPLUG_CPU */ @@ -3572,7 +3614,6 @@ int __init sched_tick_offload_init(void) { tick_work_cpu = alloc_percpu(struct tick_work); BUG_ON(!tick_work_cpu); - return 0; } -- cgit v1.2.3 From 1f3ebc8253ee56bfaa883c5114fb5569c56f6197 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Jun 2019 14:05:52 -0700 Subject: rcu: Restore barrier() to rcu_read_lock() and rcu_read_unlock() Commit bb73c52bad36 ("rcu: Don't disable preemption for Tiny and Tree RCU readers") removed the barrier() calls from rcu_read_lock() and rcu_write_lock() in CONFIG_PREEMPT=n&&CONFIG_PREEMPT_COUNT=n kernels. Within RCU, this commit was OK, but it failed to account for things like get_user() that can pagefault and that can be reordered by the compiler. Lack of the barrier() calls in rcu_read_lock() and rcu_read_unlock() can cause these page faults to migrate into RCU read-side critical sections, which in CONFIG_PREEMPT=n kernels could result in too-short grace periods and arbitrary misbehavior. Please see commit 386afc91144b ("spinlocks and preemption points need to be at least compiler barriers") and Linus's commit 66be4e66a7f4 ("rcu: locking and unlocking need to always be at least barriers"), this last of which restores the barrier() call to both rcu_read_lock() and rcu_read_unlock(). This commit removes barrier() calls that are no longer needed given that the addition of them in Linus's commit noted above. The combination of this commit and Linus's commit effectively reverts commit bb73c52bad36 ("rcu: Don't disable preemption for Tiny and Tree RCU readers"). Reported-by: Herbert Xu Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney [ paulmck: Fix embarrassing typo located by Alan Stern. 
] --- kernel/rcu/tree_plugin.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index acb225023ed1..3f1b5041de9b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; - barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); @@ -340,7 +339,6 @@ void rcu_note_context_switch(bool preempt) if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -828,11 +826,6 @@ static void rcu_qs(void) * dyntick-idle quiescent state visible to other CPUs, which will in * some cases serve for expedited as well as normal grace periods. * Either way, register a lightweight quiescent state. - * - * The barrier() calls are redundant in the common case when this is - * called externally, but just in case this is called from within this - * file. - * */ void rcu_all_qs(void) { @@ -847,14 +840,12 @@ void rcu_all_qs(void) return; } this_cpu_write(rcu_data.rcu_urgent_qs, false); - barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); } rcu_qs(); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -864,7 +855,6 @@ EXPORT_SYMBOL_GPL(rcu_all_qs); */ void rcu_note_context_switch(bool preempt) { - barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_qs(); /* Load rcu_urgent_qs before other flags. */ @@ -877,7 +867,6 @@ void rcu_note_context_switch(bool preempt) rcu_tasks_qs(current); out: trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -- cgit v1.2.3 From cdc694b2359d52cd6d0465d5a6263d97c786fb0c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Jun 2019 15:30:49 -0700 Subject: rcu: Add kernel parameter to dump trace after RCU CPU stall warning This commit adds a rcu_cpu_stall_ftrace_dump kernel boot parameter, that, when set, causes the trace buffer to be dumped after an RCU CPU stall warning is printed. This kernel boot parameter is disabled by default, maintaining compatibility with previous behavior. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcu.h | 1 + kernel/rcu/tree_stall.h | 4 ++++ kernel/rcu/update.c | 2 ++ 3 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 5290b01de534..8fd4f82c9b3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -227,6 +227,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) #ifdef CONFIG_RCU_STALL_COMMON +extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 065183391f75..0627a66699a6 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -527,6 +527,8 @@ static void check_cpu_stall(struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(); + if (rcu_cpu_stall_ftrace_dump) + rcu_ftrace_dump(DUMP_ALL); } else if (rcu_gp_in_progress() && ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && @@ -534,6 +536,8 @@ static void check_cpu_stall(struct rcu_data *rdp) /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2); + if (rcu_cpu_stall_ftrace_dump) + rcu_ftrace_dump(DUMP_ALL); } } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 61df2bf08563..249517058b13 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -437,6 +437,8 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); #endif #ifdef CONFIG_RCU_STALL_COMMON +int rcu_cpu_stall_ftrace_dump __read_mostly; +module_param(rcu_cpu_stall_ftrace_dump, int, 0644); int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); -- cgit v1.2.3 From fbad01af8c3bb9618848abde8054ab7e0c2330fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 Jun 2019 15:42:51 -0700 Subject: rcu: Add destroy_work_on_stack() to match INIT_WORK_ONSTACK() The synchronize_rcu_expedited() function has an INIT_WORK_ONSTACK(), but lacks the corresponding destroy_work_on_stack(). This commit therefore adds destroy_work_on_stack(). Reported-by: Andrea Arcangeli Signed-off-by: Paul E. McKenney Acked-by: Andrea Arcangeli --- kernel/rcu/tree_exp.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index af7e7b9c86af..513b403b683b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -792,6 +792,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { + bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -817,7 +818,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { + if (unlikely(boottime)) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -835,5 +836,8 @@ void synchronize_rcu_expedited(void) /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); + + if (likely(!boottime)) + destroy_work_on_stack(&rew.rew_work); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -- cgit v1.2.3 From 7e210a653ec9445512534cd235cac29e7301af2a Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 28 Jun 2019 17:11:10 -0700 Subject: srcu: Avoid srcutorture security-based pointer obfuscation Because pointer output is now obfuscated, and because what you really want to know is whether or not the callback lists are empty, this commit replaces the srcu_data structure's head callback pointer printout with a single character that is "." is the callback list is empty or "C" otherwise. This is the only remaining user of rcu_segcblist_head(), so this commit also removes this function's definition. It also turns out that rcu_segcblist_tail() no longer has any callers, so this commit removes that function's definition while in the area. They were both marked "Interim", and their end has come. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.h | 21 --------------------- kernel/rcu/srcutree.c | 5 +++-- 2 files changed, 3 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 71b64648464e..822a39da0533 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -76,27 +76,6 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) return !*rsclp->tails[seg]; } -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) -{ - return rsclp->head; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); - return rsclp->tails[RCU_NEXT_TAIL]; -} - void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index cf0e886314f2..5dffade2d7cd 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1279,8 +1279,9 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) c0 = l0 - u0; c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %1p)", - cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); s0 += c0; s1 += c1; } -- cgit v1.2.3 From 3545832fc22e2316d9c289f6ba825710a268bfa6 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 1 Jul 2019 09:40:39 +0900 Subject: rcu: Change return type of rcu_spawn_one_boost_kthread() The return value of rcu_spawn_one_boost_kthread() is not used any longer. This commit therefore changes its return type from int to void, and removes the cast to void from its callers. Signed-off-by: Byungchul Park Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3f1b5041de9b..307ae6ebb804 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1123,7 +1123,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * already exist. We only create this kthread for preemptible RCU. * Returns zero if all is well, a negated errno otherwise. 
*/ -static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) +static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { int rnp_index = rnp - rcu_get_root(); unsigned long flags; @@ -1131,25 +1131,27 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct task_struct *t; if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) - return 0; + return; if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) - return 0; + return; rcu_state.boost = 1; + if (rnp->boost_kthread_task != NULL) - return 0; + return; + t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); + if (WARN_ON_ONCE(IS_ERR(t))) + return; + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - return 0; } /* @@ -1190,7 +1192,7 @@ static void __init rcu_spawn_boost_kthreads(void) struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) - (void)rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_one_boost_kthread(rnp); } static void rcu_prepare_kthreads(int cpu) @@ -1200,7 +1202,7 @@ static void rcu_prepare_kthreads(int cpu) /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ if (rcu_scheduler_fully_active) - (void)rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_one_boost_kthread(rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ -- cgit v1.2.3 From 0a5b99f57873e233ad42ef71e23c629f6ea1fcfe Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 11 Jul 2019 16:45:41 -0400 Subject: treewide: Rename rcu_dereference_raw_notrace() to _check() The rcu_dereference_raw_notrace() API name is confusing. It is equivalent to rcu_dereference_raw() except that it also does sparse pointer checking. There are only a few users of rcu_dereference_raw_notrace(). This patches renames all of them to be rcu_dereference_raw_check() with the "_check()" indicating sparse checking. Signed-off-by: Joel Fernandes (Google) [ paulmck: Fix checkpatch warnings about parentheses. ] Signed-off-by: Paul E. McKenney --- kernel/trace/ftrace_internal.h | 8 ++++---- kernel/trace/trace.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 0515a2096f90..0456e0a3dab1 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -6,22 +6,22 @@ /* * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw_notrace() is that elements removed from this list + * can use rcu_dereference_raw_check() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle + * mechanism. The rcu_dereference_raw_check() calls are needed to handle * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ #define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw_notrace(list); \ + op = rcu_dereference_raw_check(list); \ do /* * Optimized for just a single item in the list (as that is the normal case). 
*/ #define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ + while (likely(op = rcu_dereference_raw_check((op)->next)) && \ unlikely((op) != &ftrace_list_end)) extern struct ftrace_ops __rcu *ftrace_ops_list; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 525a97fbbc60..642474b26ba7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2642,10 +2642,10 @@ static void ftrace_exports(struct ring_buffer_event *event) preempt_disable_notrace(); - export = rcu_dereference_raw_notrace(ftrace_exports_list); + export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event); - export = rcu_dereference_raw_notrace(export->next); + export = rcu_dereference_raw_check(export->next); } preempt_enable_notrace(); -- cgit v1.2.3 From 9147089bee3a6b504821dd8462e2be229e6dbfae Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 18:12:21 -0400 Subject: rcu: Remove redundant debug_locks check in rcu_read_lock_sched_held() The debug_locks flag can never be true at the end of rcu_read_lock_sched_held() because it is already checked by the earlier call to debug_lockdep_rcu_enabled(). This commit therefore removes this redundant check. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 61df2bf08563..9dd5aeef6e70 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -93,17 +93,13 @@ module_param(rcu_normal_after_boot, int, 0); */ int rcu_read_lock_sched_held(void) { - int lockdep_opinion = 0; - if (!debug_lockdep_rcu_enabled()) return 1; if (!rcu_is_watching()) return 0; if (!rcu_lockdep_current_cpu_online()) return 0; - if (debug_locks) - lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || !preemptible(); + return lock_is_held(&rcu_sched_lock_map) || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif -- cgit v1.2.3 From b3f3886c59f649ace424d132bd8c06e3611c71a8 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Fri, 31 May 2019 23:15:45 +0800 Subject: rcuperf: Fix perf_type module-parameter description The rcu_bh rcuperf type was removed by commit 620d246065cd ("rcuperf: Remove the "rcu_bh" and "sched" torture types"), but it lives on in the MODULE_PARM_DESC() of perf_type. This commit therefore changes that module-parameter description to substitute srcu for rcu_bh. Signed-off-by: Xiao Yang Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 7a6890b23c5f..4513807cd4c4 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)"); static int nrealreaders; static int nrealwriters; -- cgit v1.2.3 From bd1bfc51a36f334270b886db6d8467e55fe294ca Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Sat, 22 Jun 2019 14:35:59 -0700 Subject: rcutorture: Emulate userspace sojourn during call_rcu() floods During an actual call_rcu() flood, there would be frequent trips to userspace (in-kernel call_rcu() floods must be otherwise housebroken). Userspace execution allows a great many things to interrupt execution, and rcutorture needs to also allow such interruptions. This commit therefore causes call_rcu() floods to occasionally invoke schedule(), thus preventing spurious rcutorture failures due to other parts of the kernel becoming irate at the call_rcu() flood events. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fce4e7e6f502..c44e5307afcc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1713,12 +1713,14 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) } // Give the scheduler a chance, even on nohz_full CPUs. -static void rcu_torture_fwd_prog_cond_resched(void) +static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) { if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { - if (need_resched()) + // Real call_rcu() floods hit userspace, so emulate that. + if (need_resched() || (iter & 0xfff)) schedule(); } else { + // No userspace emulation: CB invocation throttles call_rcu() cond_resched(); } } @@ -1746,7 +1748,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; - rcu_torture_fwd_prog_cond_resched(); + rcu_torture_fwd_prog_cond_resched(freed); } return freed; } @@ -1790,7 +1792,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) udelay(10); cur_ops->readunlock(idx); if (!fwd_progress_need_resched || need_resched()) - rcu_torture_fwd_prog_cond_resched(); + rcu_torture_fwd_prog_cond_resched(1); } (*tested_tries)++; if (!time_before(jiffies, stopat) && @@ -1875,7 +1877,7 @@ static void rcu_torture_fwd_prog_cr(void) rfcp->rfc_gps = 0; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - rcu_torture_fwd_prog_cond_resched(); + rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); } stoppedat = jiffies; n_launders_cb_snap = READ_ONCE(n_launders_cb); -- cgit v1.2.3 From 21f57546ceaf4c5537a617f55b809a843b109210 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Thu, 4 Jul 2019 15:57:19 +0300 Subject: torture: Remove exporting of internal functions The functions torture_onoff_cleanup() and torture_shuffle_cleanup() are declared static and marked EXPORT_SYMBOL_GPL(), which is at best an odd combination. Because these functions are not used outside of the kernel/torture.c file they are defined in, this commit removes their EXPORT_SYMBOL_GPL() marking. Fixes: cc47ae083026 ("rcutorture: Abstract torture-test cleanup") Signed-off-by: Denis Efremov Signed-off-by: Paul E. McKenney --- kernel/torture.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index a8d9bdfba7c3..7c13f5558b71 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -263,7 +263,6 @@ static void torture_onoff_cleanup(void) onoff_task = NULL; #endif /* #ifdef CONFIG_HOTPLUG_CPU */ } -EXPORT_SYMBOL_GPL(torture_onoff_cleanup); /* * Print online/offline testing statistics. @@ -449,7 +448,6 @@ static void torture_shuffle_cleanup(void) } shuffler_task = NULL; } -EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); /* * Variables for auto-shutdown. 
This allows "lights out" torture runs -- cgit v1.2.3 From 77e9752ce69f36f1be4e366373727fb7921f5909 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 4 Jul 2019 00:34:30 -0400 Subject: rcuperf: Make rcuperf kernel test more robust for !expedited mode It is possible that the rcuperf kernel test runs concurrently with init starting up. During this time, the system is running all grace periods as expedited. However, rcuperf can also be run for normal GP tests. Right now, it depends on a holdoff time before starting the test to ensure grace periods start later. This works fine with the default holdoff time; however, it is not robust in situations where init takes longer than the holdoff time to finish running. Or, as in my case: I modified the rcuperf test locally to also run a thread that did preempt disable/enable in a loop. This had the effect of slowing down init. The end result was that the "batches:" counter in rcuperf was 0, causing a division by 0 error in the results. This counter was 0 because only expedited GPs seemed to happen, not normal ones, which led to the rcu_state.gp_seq counter remaining constant across grace periods that unexpectedly happened to be expedited. The system was running expedited RCU all the time because rcu_unexpedited_gp() would not have run yet from init. In other words, the test would run concurrently with init booting in expedited GP mode. To fix this properly, this commit waits until system_state is set to SYSTEM_RUNNING before starting the test. This change is made just before kernel_init() invokes rcu_end_inkernel_boot(), and the latter is what turns off boot-time expediting of RCU grace periods. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 4513807cd4c4..5a879d073c1c 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -375,6 +375,14 @@ rcu_perf_writer(void *arg) if (holdoff) schedule_timeout_uninterruptible(holdoff * HZ); + /* + * Wait until rcu_end_inkernel_boot() is called for normal GP tests + * so that RCU is not always expedited for normal GP tests. + * The system_state test is approximate, but works well in practice. + */ + while (!gp_exp && system_state != SYSTEM_RUNNING) + schedule_timeout_uninterruptible(1); + t = ktime_get_mono_fast_ns(); if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { t_rcu_perf_writer_started = t; -- cgit v1.2.3 From 60013d5d2b4031e6027005e5e2dcb6ee6da6b186 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jul 2019 08:30:00 -0700 Subject: rcutorture: Aggressive forward-progress tests shouldn't block shutdown The more aggressive forward-progress tests can interfere with rcutorture shutdown, resulting in false-positive diagnostics. This commit therefore ends any such tests 30 seconds prior to shutdown. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c44e5307afcc..b22947324423 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -161,6 +161,7 @@ static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed?
*/ static struct list_head rcu_torture_removed; +static unsigned long shutdown_jiffies; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -228,6 +229,15 @@ static u64 notrace rcu_trace_clock_local(void) } #endif /* #else #ifdef CONFIG_RCU_TRACE */ +/* + * Stop aggressive CPU-hog tests a bit before the end of the test in order + * to avoid interfering with test shutdown. + */ +static bool shutdown_time_arrived(void) +{ + return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ); +} + static unsigned long boost_starttime; /* jiffies of next boost test start. */ static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -1787,6 +1797,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) WRITE_ONCE(rcu_fwd_startat, jiffies); stopat = rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { idx = cur_ops->readlock(); udelay(10); @@ -1796,6 +1807,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } (*tested_tries)++; if (!time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { (*tested)++; cver = READ_ONCE(rcu_torture_current_version) - cver; @@ -1854,6 +1866,7 @@ static void rcu_torture_fwd_prog_cr(void) gps = cur_ops->get_gp_seq(); rcu_launder_gp_seq_start = gps; while (time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { rfcp = READ_ONCE(rcu_fwd_cb_head); rfcpn = NULL; @@ -1886,7 +1899,8 @@ static void rcu_torture_fwd_prog_cr(void) cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ (void)rcu_torture_fwd_prog_cbfree(); - if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { + if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && + !shutdown_time_arrived()) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", __func__, @@ -2467,6 +2481,7 @@ rcu_torture_init(void) goto unwind; rcutor_hp = firsterr; } + shutdown_jiffies = jiffies + shutdown_secs * HZ; firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; -- cgit v1.2.3 From 2c8db5bef9fb442a5a0d2fee28aed20100362ce3 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Wed, 31 Jul 2019 14:29:33 -0700 Subject: PM/sleep: Expose suspend stats in sysfs Userspace can get suspend stats from the suspend stats debugfs node. Since debugfs doesn't have stable ABI, expose suspend stats in sysfs under /sys/power/suspend_stats. Signed-off-by: Kalesh Singh Reviewed-by: Greg Kroah-Hartman Signed-off-by: Rafael J. 
Wysocki --- kernel/power/main.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index bdbd605c4215..938dc53a8b94 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -254,7 +254,6 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_test); #endif /* CONFIG_PM_SLEEP_DEBUG */ -#ifdef CONFIG_DEBUG_FS static char *suspend_step_name(enum suspend_stat_step step) { switch (step) { @@ -275,6 +274,92 @@ static char *suspend_step_name(enum suspend_stat_step step) } } +#define suspend_attr(_name) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "%d\n", suspend_stats._name); \ +} \ +static struct kobj_attribute _name = __ATTR_RO(_name) + +suspend_attr(success); +suspend_attr(fail); +suspend_attr(failed_freeze); +suspend_attr(failed_prepare); +suspend_attr(failed_suspend); +suspend_attr(failed_suspend_late); +suspend_attr(failed_suspend_noirq); +suspend_attr(failed_resume); +suspend_attr(failed_resume_early); +suspend_attr(failed_resume_noirq); + +static ssize_t last_failed_dev_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int index; + char *last_failed_dev = NULL; + + index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + index %= REC_FAILED_NUM; + last_failed_dev = suspend_stats.failed_devs[index]; + + return sprintf(buf, "%s\n", last_failed_dev); +} +static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); + +static ssize_t last_failed_errno_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int index; + int last_failed_errno; + + index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; + index %= REC_FAILED_NUM; + last_failed_errno = suspend_stats.errno[index]; + + return sprintf(buf, "%d\n", last_failed_errno); +} +static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); + +static ssize_t last_failed_step_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int index; + enum suspend_stat_step step; + char *last_failed_step = NULL; + + index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; + index %= REC_FAILED_NUM; + step = suspend_stats.failed_steps[index]; + last_failed_step = suspend_step_name(step); + + return sprintf(buf, "%s\n", last_failed_step); +} +static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); + +static struct attribute *suspend_attrs[] = { + &success.attr, + &fail.attr, + &failed_freeze.attr, + &failed_prepare.attr, + &failed_suspend.attr, + &failed_suspend_late.attr, + &failed_suspend_noirq.attr, + &failed_resume.attr, + &failed_resume_early.attr, + &failed_resume_noirq.attr, + &last_failed_dev.attr, + &last_failed_errno.attr, + &last_failed_step.attr, + NULL, +}; + +static struct attribute_group suspend_attr_group = { + .name = "suspend_stats", + .attrs = suspend_attrs, +}; + +#ifdef CONFIG_DEBUG_FS static int suspend_stats_show(struct seq_file *s, void *unused) { int i, index, last_dev, last_errno, last_step; @@ -794,6 +879,14 @@ static const struct attribute_group attr_group = { .attrs = g, }; +static const struct attribute_group *attr_groups[] = { + &attr_group, +#ifdef CONFIG_PM_SLEEP + &suspend_attr_group, +#endif + NULL, +}; + struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); @@ -815,7 +908,7 @@ static int __init pm_init(void) power_kobj = 
kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; - error = sysfs_create_group(power_kobj, &attr_group); + error = sysfs_create_groups(power_kobj, attr_groups); if (error) return error; pm_print_times_init(); -- cgit v1.2.3 From c8424e776b093280d3fdd104d850706b3b229ac8 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 4 Jul 2019 15:57:34 -0300 Subject: MODSIGN: Export module signature definitions IMA will use the module_signature format for append signatures, so export the relevant definitions and factor out the code which verifies that the appended signature trailer is valid. Also, create a CONFIG_MODULE_SIG_FORMAT option so that IMA can select it and be able to use mod_check_sig() without having to depend on either CONFIG_MODULE_SIG or CONFIG_MODULES. s390 duplicated the definition of struct module_signature so now they can use the new header instead. Signed-off-by: Thiago Jung Bauermann Acked-by: Jessica Yu Reviewed-by: Philipp Rudo Cc: Heiko Carstens Signed-off-by: Mimi Zohar --- kernel/Makefile | 1 + kernel/module.c | 1 + kernel/module_signature.c | 46 ++++++++++++++++++++++++++++++++++++++ kernel/module_signing.c | 56 +++++++---------------------------------------- 4 files changed, 56 insertions(+), 48 deletions(-) create mode 100644 kernel/module_signature.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index a8d923b5481b..179b44ab8b46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,6 +58,7 @@ endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o +obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o diff --git a/kernel/module.c b/kernel/module.c index 5933395af9a0..5ac22efc3685 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/module_signature.c b/kernel/module_signature.c new file mode 100644 index 000000000000..4224a1086b7d --- /dev/null +++ b/kernel/module_signature.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include + +/** + * mod_check_sig - check that the given signature is sane + * + * @ms: Signature to check. + * @file_len: Size of the file to which @ms is appended. + * @name: What is being checked. Used for error messages. 
+ */ +int mod_check_sig(const struct module_signature *ms, size_t file_len, + const char *name) +{ + if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms)) + return -EBADMSG; + + if (ms->id_type != PKEY_ID_PKCS7) { + pr_err("%s: Module is not signed with expected PKCS#7 message\n", + name); + return -ENOPKG; + } + + if (ms->algo != 0 || + ms->hash != 0 || + ms->signer_len != 0 || + ms->key_id_len != 0 || + ms->__pad[0] != 0 || + ms->__pad[1] != 0 || + ms->__pad[2] != 0) { + pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", + name); + return -EBADMSG; + } + + return 0; +} diff --git a/kernel/module_signing.c b/kernel/module_signing.c index b10fb1986ca9..9d9fc678c91d 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -7,37 +7,13 @@ #include #include +#include +#include #include #include #include #include "module-internal.h" -enum pkey_id_type { - PKEY_ID_PGP, /* OpenPGP generated key ID */ - PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ - PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ -}; - -/* - * Module signature information block. - * - * The constituents of the signature section are, in order: - * - * - Signer's name - * - Key identifier - * - Signature data - * - Information block - */ -struct module_signature { - u8 algo; /* Public-key crypto algorithm [0] */ - u8 hash; /* Digest algorithm [0] */ - u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ - u8 signer_len; /* Length of signer's name [0] */ - u8 key_id_len; /* Length of key identifier [0] */ - u8 __pad[3]; - __be32 sig_len; /* Length of signature data */ -}; - /* * Verify the signature on a module. */ @@ -45,6 +21,7 @@ int mod_verify_sig(const void *mod, struct load_info *info) { struct module_signature ms; size_t sig_len, modlen = info->len; + int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -52,32 +29,15 @@ int mod_verify_sig(const void *mod, struct load_info *info) return -EBADMSG; memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); - modlen -= sizeof(ms); + + ret = mod_check_sig(&ms, modlen, info->name); + if (ret) + return ret; sig_len = be32_to_cpu(ms.sig_len); - if (sig_len >= modlen) - return -EBADMSG; - modlen -= sig_len; + modlen -= sig_len + sizeof(ms); info->len = modlen; - if (ms.id_type != PKEY_ID_PKCS7) { - pr_err("%s: Module is not signed with expected PKCS#7 message\n", - info->name); - return -ENOPKG; - } - - if (ms.algo != 0 || - ms.hash != 0 || - ms.signer_len != 0 || - ms.key_id_len != 0 || - ms.__pad[0] != 0 || - ms.__pad[1] != 0 || - ms.__pad[2] != 0) { - pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", - info->name); - return -EBADMSG; - } - return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, VERIFY_USE_SECONDARY_KEYRING, VERIFYING_MODULE_SIGNATURE, -- cgit v1.2.3 From 91d2a812dfb98b3b4dad661529c33bc38d303461 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 25 Jun 2019 10:39:13 -0400 Subject: locking/rwsem: Make handoff writer optimistically spin on owner When the handoff bit is set by a writer, no task other than the setting writer itself is allowed to acquire the lock. If the to-be-handoff'ed writer goes to sleep, there will be a wakeup latency period where the lock is free, but no one can acquire it. That is less than ideal. To reduce that latency, the handoff writer will now optimistically spin on the owner if it happens to be an on-cpu writer. It will spin until the owner releases the lock, and the to-be-handoff'ed writer can then acquire the lock immediately without any delay.
Of course, if the owner is not an on-cpu writer, the to-be-handoff'ed writer will have to sleep anyway. The optimistic spinning code is also modified to not stop spinning when the handoff bit is set. This will prevent an occasional setting of the handoff bit from causing a bunch of optimistic spinners to enter the wait queue, causing a significant reduction in throughput. On a 1-socket 22-core 44-thread Skylake system, the AIM7 shared_memory workload was run with 7000 users. The throughput (jobs/min) of the following kernels was as follows: 1) 5.2-rc6 - 8,092,486 2) 5.2-rc6 + tip's rwsem patches - 7,567,568 3) 5.2-rc6 + tip's rwsem patches + this patch - 7,954,545 Using perf-record(1), the %cpu time used by rwsem_down_write_slowpath(), rwsem_down_write_failed() and their callees for the 3 kernels was 1.70%, 5.46% and 2.08%, respectively. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: x86@kernel.org Cc: Ingo Molnar Cc: Will Deacon Cc: huang ying Cc: Tim Chen Cc: Linus Torvalds Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Davidlohr Bueso Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20190625143913.24154-1-longman@redhat.com --- kernel/locking/rwsem.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index bd0f0d05724c..354238a08b7a 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -724,11 +724,12 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) rcu_read_lock(); for (;;) { - if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) { - state = OWNER_NONSPINNABLE; - break; - } - + /* + * When a waiting writer set the handoff flag, it may spin + * on the owner as well. Once that writer acquires the lock, + * we can spin on it. So we don't need to quit even when the + * handoff bit is set. + */ new = rwsem_owner_flags(sem, &new_flags); if ((new != owner) || (new_flags != flags)) { state = rwsem_owner_state(new, new_flags, nonspinnable); @@ -974,6 +975,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, { return false; } + +static inline int +rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +{ + return 0; +} +#define OWNER_NULL 1 #endif /* @@ -1206,6 +1214,18 @@ wait: raw_spin_unlock_irq(&sem->wait_lock); + /* + * After setting the handoff bit and failing to acquire + * the lock, attempt to spin on owner to accelerate lock + * transfer. If the previous owner is a on-cpu writer and it + * has just released the lock, OWNER_NULL will be returned. + * In this case, we attempt to acquire the lock again + * without sleeping. + */ + if ((wstate == WRITER_HANDOFF) && + (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) + goto trylock_again; + /* Block until there are no active lockers. */ for (;;) { if (signal_pending_state(state, current)) @@ -1240,7 +1260,7 @@ wait: break; } } - +trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From fce45cd41101f1a9620267146b21f09b3454d8db Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 28 Jul 2019 21:47:35 -0700 Subject: locking/rwsem: Check for operations on an uninitialized rwsem Currently the rwsem is the only locking primitive that lacks this debug feature. Add it under CONFIG_DEBUG_RWSEMS and do the magic checking in the locking fastpath (trylock) operation such that we cover all cases. The unlocking part is pretty straightforward.
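As an aside, the check being added follows the familiar "magic field points at itself" pattern: the init function stores the object's own address in a dedicated field, and later operations cheaply assert that the memory they were handed really was initialized. A minimal userspace sketch of that pattern (illustration only, with made-up names, not the kernel implementation):

/*
 * Illustration only: an initialized structure records its own address in
 * a magic field; operations then assert on it to catch use of memory
 * that was never initialized (or has been corrupted).
 */
#include <assert.h>
#include <stdio.h>

struct fake_sem {
        long count;
        struct fake_sem *magic;
};

static void fake_sem_init(struct fake_sem *sem)
{
        sem->count = 0;
        sem->magic = sem;               /* self-reference marks "initialized" */
}

static void fake_sem_lock(struct fake_sem *sem)
{
        assert(sem->magic == sem);      /* stand-in for DEBUG_RWSEMS_WARN_ON() */
        sem->count++;
}

int main(void)
{
        struct fake_sem s;

        fake_sem_init(&s);
        fake_sem_lock(&s);
        printf("count=%ld\n", s.count);
        return 0;
}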
Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Cc: mingo@kernel.org Cc: Davidlohr Bueso Link: https://lkml.kernel.org/r/20190729044735.9632-1-dave@stgolabs.net --- kernel/locking/rwsem.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 354238a08b7a..eef04551eae7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -105,8 +105,9 @@ #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ if (!debug_locks_silent && \ - WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ #c, atomic_long_read(&(sem)->count), \ + (unsigned long) sem->magic, \ atomic_long_read(&(sem)->owner), (long)current, \ list_empty(&(sem)->wait_list) ? "" : "not ")) \ debug_locks_off(); \ @@ -329,6 +330,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, */ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); +#endif +#ifdef CONFIG_DEBUG_RWSEMS + sem->magic = sem; #endif atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); @@ -1358,11 +1362,14 @@ static inline int __down_read_killable(struct rw_semaphore *sem) static inline int __down_read_trylock(struct rw_semaphore *sem) { + long tmp; + + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + /* * Optimize for the case when the rwsem is not locked at all. */ - long tmp = RWSEM_UNLOCKED_VALUE; - + tmp = RWSEM_UNLOCKED_VALUE; do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, tmp + RWSEM_READER_BIAS)) { @@ -1403,8 +1410,11 @@ static inline int __down_write_killable(struct rw_semaphore *sem) static inline int __down_write_trylock(struct rw_semaphore *sem) { - long tmp = RWSEM_UNLOCKED_VALUE; + long tmp; + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + + tmp = RWSEM_UNLOCKED_VALUE; if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); @@ -1420,7 +1430,9 @@ inline void __up_read(struct rw_semaphore *sem) { long tmp; + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); @@ -1438,12 +1450,14 @@ static inline void __up_write(struct rw_semaphore *sem) { long tmp; + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); /* * sem->owner may differ from current if the ownership is transferred * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. */ DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) -- cgit v1.2.3 From 5f35d5a66b3ec62cb5ec4ec2ad9aebe2ac325673 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Wed, 31 Jul 2019 20:35:03 +0530 Subject: locking/mutex: Make __mutex_owner static to mutex.c __mutex_owner() should only be used by the mutex APIs. So, to enforce this restriction, let's move the __mutex_owner() function definition from linux/mutex.h to the mutex.c file.
There exist functions that use __mutex_owner(), like mutex_is_locked() and mutex_trylock_recursive(), so to keep the legacy behaviour intact, move them as well and export them. Also move the mutex_waiter structure to keep it private to the file. Signed-off-by: Mukesh Ojha Signed-off-by: Peter Zijlstra (Intel) Cc: mingo@redhat.com Cc: will@kernel.org Link: https://lkml.kernel.org/r/1564585504-3543-1-git-send-email-mojha@codeaurora.org --- kernel/locking/mutex.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/locking/mutex.h | 2 ++ 2 files changed, 41 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 5e069734363c..ac4929f1e085 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -36,6 +36,19 @@ # include "mutex.h" #endif +/* + * This is the control structure for tasks blocked on mutex, + * which resides on the blocked task's kernel stack: + */ +struct mutex_waiter { + struct list_head list; + struct task_struct *task; + struct ww_acquire_ctx *ww_ctx; +#ifdef CONFIG_DEBUG_MUTEXES + void *magic; +#endif +}; + void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -65,11 +78,37 @@ EXPORT_SYMBOL(__mutex_init); #define MUTEX_FLAGS 0x07 +/* + * Internal helper function; C doesn't allow us to hide it :/ + * + * DO NOT USE (outside of mutex code). + */ +static inline struct task_struct *__mutex_owner(struct mutex *lock) +{ + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07); +} + static inline struct task_struct *__owner_task(unsigned long owner) { return (struct task_struct *)(owner & ~MUTEX_FLAGS); } +bool mutex_is_locked(struct mutex *lock) +{ + return __mutex_owner(lock) != NULL; +} +EXPORT_SYMBOL(mutex_is_locked); + +__must_check enum mutex_trylock_recursive_enum +mutex_trylock_recursive(struct mutex *lock) +{ + if (unlikely(__mutex_owner(lock) == current)) + return MUTEX_TRYLOCK_RECURSIVE; + + return mutex_trylock(lock); +} +EXPORT_SYMBOL(mutex_trylock_recursive); + static inline unsigned long __owner_flags(unsigned long owner) { return owner & MUTEX_FLAGS; diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 1c2287d3fa71..7cde5c6d414e 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -19,6 +19,8 @@ #define debug_mutex_unlock(lock) do { } while (0) #define debug_mutex_init(lock, name, key) do { } while (0) +struct mutex_waiter; + static inline void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { -- cgit v1.2.3 From a037d269221c0ae15f47046757afcbd1a7177bbf Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Wed, 31 Jul 2019 20:35:04 +0530 Subject: locking/mutex: Use mutex flags macro instead of hard code Use the mutex flags macro instead of a hard-coded value inside __mutex_owner().
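For context, the owner word packs a task_struct pointer together with three low-order flag bits, which works because the pointed-to structure is aligned well beyond 8 bytes, so __mutex_owner() only has to mask those bits off. A rough userspace sketch of that masking (hypothetical names, not the kernel types):

/*
 * Illustration only: a pointer and low-bit flags sharing one word.
 * FLAGS_MASK plays the role of MUTEX_FLAGS (0x07); the object must be at
 * least 8-byte aligned so the bottom three bits are free for flags.
 */
#include <stdint.h>
#include <stdio.h>

#define FLAGS_MASK      ((uintptr_t)0x07)

struct fake_task {
        long dummy;
} __attribute__((aligned(8)));

int main(void)
{
        static struct fake_task task;
        uintptr_t owner_word = (uintptr_t)&task | 0x01;  /* owner + one flag bit */

        struct fake_task *owner = (struct fake_task *)(owner_word & ~FLAGS_MASK);
        unsigned long flags = (unsigned long)(owner_word & FLAGS_MASK);

        printf("owner=%p flags=%#lx\n", (void *)owner, flags);
        return 0;
}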
Signed-off-by: Mukesh Ojha Signed-off-by: Peter Zijlstra (Intel) Cc: mingo@redhat.com Cc: will@kernel.org Link: https://lkml.kernel.org/r/1564585504-3543-2-git-send-email-mojha@codeaurora.org --- kernel/locking/mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index ac4929f1e085..b4bcb0236d7a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(__mutex_init); */ static inline struct task_struct *__mutex_owner(struct mutex *lock) { - return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x07); + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); } static inline struct task_struct *__owner_task(unsigned long owner) -- cgit v1.2.3 From 63f0c60379650d82250f22e4cf4137ef3dc4f43d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 23 Jul 2019 19:58:39 +0200 Subject: arm64: Introduce prctl() options to control the tagged user addresses ABI It is not desirable to relax the ABI to allow tagged user addresses into the kernel indiscriminately. This patch introduces a prctl() interface for enabling or disabling the tagged ABI with a global sysctl control for preventing applications from enabling the relaxed ABI (meant for testing user-space prctl() return error checking without reconfiguring the kernel). The ABI properties are inherited by threads of the same application and fork()'ed children but cleared on execve(). A Kconfig option allows the overall disabling of the relaxed ABI. The PR_SET_TAGGED_ADDR_CTRL will be expanded in the future to handle MTE-specific settings like imprecise vs precise exceptions. Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas Signed-off-by: Andrey Konovalov Signed-off-by: Will Deacon --- kernel/sys.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 2969304c29fe..c6c4d5358bd3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -124,6 +124,12 @@ #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif +#ifndef SET_TAGGED_ADDR_CTRL +# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL) +#endif +#ifndef GET_TAGGED_ADDR_CTRL +# define GET_TAGGED_ADDR_CTRL() (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2492,6 +2498,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = PAC_RESET_KEYS(me, arg2); break; + case PR_SET_TAGGED_ADDR_CTRL: + error = SET_TAGGED_ADDR_CTRL(arg2); + break; + case PR_GET_TAGGED_ADDR_CTRL: + error = GET_TAGGED_ADDR_CTRL(); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From b977fcf477c176e5f41775f0ea139f935b0f25b7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 31 Jul 2019 15:13:19 +0100 Subject: irqdomain/debugfs: Use PAs to generate fwnode names Booting a large arm64 server (HiSi D05) leads to the following shouting at boot time: [ 20.722132] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present! [ 20.730851] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present! [ 20.739560] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present! [ 20.748267] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present! [ 20.756975] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present! 
[ 20.765683] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present! [ 20.774391] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present! and many more... Evidently, we expect something a bit more informative than ____ptrval____, and certainly we want all of our domains, not just the first one. For that, turn the %p used to generate the fwnode name into something that won't be repainted (%pa). Given that we've now fixed all users to pass a pointer to a PA, it will actually do the right thing. Acked-by: Thomas Gleixner Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3078d0e48bba..e7bbab149750 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -31,7 +31,7 @@ struct irqchip_fwid { struct fwnode_handle fwnode; unsigned int type; char *name; - void *data; + phys_addr_t *pa; }; #ifdef CONFIG_GENERIC_IRQ_DEBUGFS @@ -62,7 +62,8 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); * domain struct. */ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, - const char *name, void *data) + const char *name, + phys_addr_t *pa) { struct irqchip_fwid *fwid; char *n; @@ -77,7 +78,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, n = kasprintf(GFP_KERNEL, "%s-%d", name, id); break; default: - n = kasprintf(GFP_KERNEL, "irqchip@%p", data); + n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa); break; } @@ -89,7 +90,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, fwid->type = type; fwid->name = n; - fwid->data = data; + fwid->pa = pa; fwid->fwnode.ops = &irqchip_fwnode_ops; return &fwid->fwnode; } -- cgit v1.2.3 From 653a23ca7e1e1ae907654e91d028bc6dfc7fce0c Mon Sep 17 00:00:00 2001 From: Marc Koderer Date: Tue, 6 Aug 2019 15:24:12 +0200 Subject: Use kvmalloc in cgroups-v1 Instead of using its own logic for k-/vmalloc rely on kvmalloc which is actually doing quite the same. Signed-off-by: Marc Koderer Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 88006be40ea3..7f83f4121d8d 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -193,25 +193,6 @@ struct cgroup_pidlist { struct delayed_work destroy_dwork; }; -/* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. - * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ - if (PIDLIST_TOO_LARGE(count)) - return vmalloc(array_size(count, sizeof(pid_t))); - else - return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL); -} - -static void pidlist_free(void *p) -{ - kvfree(p); -} - /* * Used to destroy all pidlists lingering waiting for destroy timer. None * should be left afterwards. @@ -244,7 +225,7 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) */ if (!delayed_work_pending(dwork)) { list_del(&l->links); - pidlist_free(l->list); + kvfree(l->list); put_pid_ns(l->key.ns); tofree = l; } @@ -365,7 +346,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, * show up until sometime later on. 
*/ length = cgroup_task_count(cgrp); - array = pidlist_allocate(length); + array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL); if (!array) return -ENOMEM; /* now, populate the array */ @@ -390,12 +371,12 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l = cgroup_pidlist_find_create(cgrp, type); if (!l) { - pidlist_free(array); + kvfree(array); return -ENOMEM; } /* store array, freeing old if necessary */ - pidlist_free(l->list); + kvfree(l->list); l->list = array; l->length = length; *lp = l; -- cgit v1.2.3 From e57d143091f1c0b1a98140a4d2e63e113afb62c0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Aug 2019 08:47:14 +0200 Subject: mutex: Fix up mutex_waiter usage The patch moving bits into mutex.c was a little too much; by also moving struct mutex_waiter a few less common CONFIGs would no longer build. Fixes: 5f35d5a66b3e ("locking/mutex: Make __mutex_owner static to mutex.c") Signed-off-by: Peter Zijlstra (Intel) --- kernel/locking/mutex.c | 13 ------------- kernel/locking/mutex.h | 2 -- 2 files changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b4bcb0236d7a..468a9b8422e3 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -36,19 +36,6 @@ # include "mutex.h" #endif -/* - * This is the control structure for tasks blocked on mutex, - * which resides on the blocked task's kernel stack: - */ -struct mutex_waiter { - struct list_head list; - struct task_struct *task; - struct ww_acquire_ctx *ww_ctx; -#ifdef CONFIG_DEBUG_MUTEXES - void *magic; -#endif -}; - void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 7cde5c6d414e..1c2287d3fa71 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -19,8 +19,6 @@ #define debug_mutex_unlock(lock) do { } while (0) #define debug_mutex_init(lock, name, key) do { } while (0) -struct mutex_waiter; - static inline void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) { -- cgit v1.2.3 From 130d9c331bc59a8733b47c58ef197a2b1fa3ed43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Aug 2019 12:42:06 +0200 Subject: rcu/tree: Fix SCHED_FIFO params A rather embarrassing mistake had us call sched_setscheduler() before initializing the parameters passed to it. Fixes: 1a763fd7c633 ("rcu/tree: Call setschedule() gp ktread to SCHED_FIFO outside of atomic region") Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paul E.
McKenney Cc: Juri Lelli --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index eb764c24bc4d..5efdce756fdf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3234,13 +3234,13 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) return 0; - if (kthread_prio) + if (kthread_prio) { + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } rnp = rcu_get_root(); raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.gp_kthread = t; - if (kthread_prio) - sp.sched_priority = kthread_prio; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); rcu_spawn_nocb_kthreads(); -- cgit v1.2.3 From 139d025cda1da5484e7287b35c019fe1dcf9b650 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 29 Jul 2019 16:05:15 +0200 Subject: sched: Clean up active_mm reference counting The current active_mm reference counting is confusing and sub-optimal. Rewrite the code to explicitly consider the 4 separate cases: user -> user When switching between two user tasks, all we need to consider is switch_mm(). user -> kernel When switching from a user task to a kernel task (which doesn't have an associated mm) we retain the last mm in our active_mm. Increment a reference count on active_mm. kernel -> kernel When switching between kernel threads, all we need to do is pass along the active_mm reference. kernel -> user When switching between a kernel and user task, we must switch from the last active_mm to the next mm, hoping of course that these are the same. Decrement a reference on the active_mm. The code keeps a different order, because as you'll note, both 'to user' cases require switch_mm(). And where the old code would increment/decrement for the 'kernel -> kernel' case, the new code observes this is a neutral operation and avoids touching the reference count. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Reviewed-by: Mathieu Desnoyers Cc: luto@kernel.org --- kernel/sched/core.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46f3ca9e392a..b4a44bc84749 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3214,12 +3214,8 @@ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) { - struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, prev, next); - mm = next->mm; - oldmm = prev->active_mm; /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -3228,22 +3224,37 @@ context_switch(struct rq *rq, struct task_struct *prev, arch_start_context_switch(prev); /* - * If mm is non-NULL, we pass through switch_mm(). If mm is - * NULL, we will pass through mmdrop() in finish_task_switch(). - * Both of these contain the full memory barrier required by - * membarrier after storing to rq->curr, before returning to - * user-space. 
+ * kernel -> kernel lazy + transfer active + * user -> kernel lazy + mmgrab() active + * + * kernel -> user switch + mmdrop() active + * user -> user switch */ - if (!mm) { - next->active_mm = oldmm; - mmgrab(oldmm); - enter_lazy_tlb(oldmm, next); - } else - switch_mm_irqs_off(oldmm, mm, next); - - if (!prev->mm) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; + if (!next->mm) { // to kernel + enter_lazy_tlb(prev->active_mm, next); + + next->active_mm = prev->active_mm; + if (prev->mm) // from user + mmgrab(prev->active_mm); + else + prev->active_mm = NULL; + } else { // to user + /* + * sys_membarrier() requires an smp_mb() between setting + * rq->curr and returning to userspace. + * + * The below provides this either through switch_mm(), or in + * case 'prev->active_mm == next->mm' through + * finish_task_switch()'s mmdrop(). + */ + + switch_mm_irqs_off(prev->active_mm, next->mm, next); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ + rq->prev_mm = prev->active_mm; + prev->active_mm = NULL; + } } rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); -- cgit v1.2.3 From de53fd7aedb100f03e5d2231cfce0e4993282425 Mon Sep 17 00:00:00 2001 From: Dave Chiluk Date: Tue, 23 Jul 2019 11:44:26 -0500 Subject: sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices It has been observed, that highly-threaded, non-cpu-bound applications running under cpu.cfs_quota_us constraints can hit a high percentage of periods throttled while simultaneously not consuming the allocated amount of quota. This use case is typical of user-interactive non-cpu bound applications, such as those running in kubernetes or mesos when run on multiple cpu cores. This has been root caused to cpu-local run queue being allocated per cpu bandwidth slices, and then not fully using that slice within the period. At which point the slice and quota expires. This expiration of unused slice results in applications not being able to utilize the quota for which they are allocated. The non-expiration of per-cpu slices was recently fixed by 'commit 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition")'. Prior to that it appears that this had been broken since at least 'commit 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")' which was introduced in v3.16-rc1 in 2014. That added the following conditional which resulted in slices never being expired. if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; Because this was broken for nearly 5 years, and has recently been fixed and is now being noticed by many users running kubernetes (https://github.com/kubernetes/kubernetes/issues/67577) it is my opinion that the mechanisms around expiring runtime should be removed altogether. This allows quota already allocated to per-cpu run-queues to live longer than the period boundary. This allows threads on runqueues that do not use much CPU to continue to use their remaining slice over a longer period of time than cpu.cfs_period_us. However, this helps prevent the above condition of hitting throttling while also not fully utilizing your cpu quota. This theoretically allows a machine to use slightly more than its allotted quota in some periods. This overflow would be bounded by the remaining quota left on each per-cpu runqueueu. This is typically no more than min_cfs_rq_runtime=1ms per cpu. 
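As an illustration (not part of this patch): the mismatch described above is easy to observe from userspace by comparing the cpu controller's throttling counters with the group's actual CPU usage. The sketch below assumes a cgroup v1 "cpu" controller mounted at /sys/fs/cgroup/cpu and a group named "test" limited to 10ms/100ms; both the mount point and the group name are assumptions made for the example.

  #include <stdio.h>

  /* Print cpu.cfs_quota_us, cpu.cfs_period_us and cpu.stat (nr_periods,
   * nr_throttled, throttled_time).  A high nr_throttled combined with low
   * actual CPU usage is the symptom discussed above.  Paths are assumed. */
  static void dump(const char *path)
  {
          char line[128];
          FILE *f = fopen(path, "r");

          if (!f)
                  return;
          while (fgets(line, sizeof(line), f))
                  printf("%s: %s", path, line);
          fclose(f);
  }

  int main(void)
  {
          dump("/sys/fs/cgroup/cpu/test/cpu.cfs_quota_us");   /* e.g. 10000 */
          dump("/sys/fs/cgroup/cpu/test/cpu.cfs_period_us");  /* e.g. 100000 */
          dump("/sys/fs/cgroup/cpu/test/cpu.stat");
          return 0;
  }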
For CPU bound tasks this will change nothing, as they should theoretically fully utilize all of their quota in each period. For user-interactive tasks as described above this provides a much better user/application experience as their cpu utilization will more closely match the amount they requested when they hit throttling. This means that cpu limits no longer strictly apply per period for non-cpu bound applications, but that they are still accurate over longer timeframes. This greatly improves performance of high-thread-count, non-cpu bound applications with low cfs_quota_us allocation on high-core-count machines. In the case of an artificial testcase (10ms/100ms of quota on 80 CPU machine), this commit resulted in almost 30x performance improvement, while still maintaining correct cpu quota restrictions. That testcase is available at https://github.com/indeedeng/fibtest. Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition") Signed-off-by: Dave Chiluk Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Ben Segall Cc: Ingo Molnar Cc: John Hammond Cc: Jonathan Corbet Cc: Kyle Anderson Cc: Gabriel Munos Cc: Peter Oskolkov Cc: Cong Wang Cc: Brendan Gregg Link: https://lkml.kernel.org/r/1563900266-19734-2-git-send-email-chiluk+linux@indeed.com --- kernel/sched/fair.c | 72 +++++----------------------------------------------- kernel/sched/sched.h | 4 --- 2 files changed, 7 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fb75c0bea80f..7d8043fc8317 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4371,8 +4371,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4394,8 +4392,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - int expires_seq; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4412,61 +4409,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires_seq = cfs_b->expires_seq; - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if (cfs_rq->expires_seq != expires_seq) { - cfs_rq->expires_seq = expires_seq; - cfs_rq->runtime_expires = expires; - } return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. 
- * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline(cfs_b->expires_seq) has advanced. - */ - if (cfs_rq->expires_seq == cfs_b->expires_seq) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; @@ -4661,8 +4614,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -4684,7 +4636,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -4709,7 +4660,7 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4737,8 +4688,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. 
This can potentially race with an unthrottled group @@ -4751,8 +4700,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u cfs_b->distribute_running = 1; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->distribute_running = 0; @@ -4834,8 +4782,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -4868,7 +4815,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); unsigned long flags; - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); @@ -4886,7 +4832,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -4895,11 +4840,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock_irqsave(&cfs_b->lock, flags); - if (expires == cfs_b->runtime_expires) - lsub_positive(&cfs_b->runtime, runtime); + lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -5056,8 +5000,6 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_active = 1; overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); - cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7583faddba33..ea48aa5daeee 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -335,8 +335,6 @@ struct cfs_bandwidth { u64 quota; u64 runtime; s64 hierarchical_quota; - u64 runtime_expires; - int expires_seq; u8 idle; u8 period_active; @@ -557,8 +555,6 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; - int expires_seq; - u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock; -- cgit v1.2.3 From 99d84bf8c65a7a0dbc9e166ca0a58ed949ac4f37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:37 +0000 Subject: stop_machine: Fix stop_cpus_in_progress ordering Make sure the entire for loop has stop_cpus_in_progress set. 
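For readers unfamiliar with barrier(): it is a compiler-only barrier (the kernel's definition is essentially an empty asm statement with a "memory" clobber), so the two calls added below simply keep the compiler from moving the flag updates across the queueing loop. A stand-alone sketch of the same pattern, not the kernel code itself:

  #define barrier() __asm__ __volatile__("" : : : "memory")

  static int stop_in_progress;    /* stands in for stop_cpus_in_progress */

  static void queue_stop_work(int ncpus, void (*queue_one)(int cpu))
  {
          int cpu;

          stop_in_progress = 1;
          barrier();      /* flag is visibly set before any work is queued */
          for (cpu = 0; cpu < ncpus; cpu++)
                  queue_one(cpu);
          barrier();      /* ... and not cleared until the loop is done */
          stop_in_progress = 0;
  }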
Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/0fd8fd4b99b9b9aa88d8b2dff897f7fd0d88f72c.1559129225.git.vpillai@digitalocean.com --- kernel/stop_machine.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b4f83f7bdf86..c7031a22aa7b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -383,6 +383,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, */ preempt_disable(); stop_cpus_in_progress = true; + barrier(); for_each_cpu(cpu, cpumask) { work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; @@ -391,6 +392,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, if (cpu_stop_queue_work(cpu, work)) queued = true; } + barrier(); stop_cpus_in_progress = false; preempt_enable(); -- cgit v1.2.3 From 5feeb7837a448f659e0aaa19fb446b1d9a4b323a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:38 +0000 Subject: sched: Fix kerneldoc comment for ia64_set_curr_task Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/fde3a65ea3091ec6b84dac3c19639f85f452c5d1.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b4a44bc84749..9a821ff68502 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6772,7 +6772,7 @@ struct task_struct *curr_task(int cpu) #ifdef CONFIG_IA64 /** - * set_curr_task - set the current task for a given CPU. + * ia64_set_curr_task - set the current task for a given CPU. * @cpu: the processor in question. * @p: the task pointer to set. * -- cgit v1.2.3 From f95d4eaee6d0207bff2dc93371133d31227d4cfb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:40 +0000 Subject: sched/{rt,deadline}: Fix set_next_task vs pick_next_task Because pick_next_task() implies set_curr_task() and some of the details haven't mattered too much, some of what _should_ be in set_curr_task() ended up in pick_next_task, correct this. This prepares the way for a pick_next_task() variant that does not affect the current state; allowing remote picking. 
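Condensed, the structure this patch arrives at looks as follows; this is a simplification of the rt.c hunks below, with the helper bodies elided (the deadline.c side is analogous):

  static void set_next_task_rt(struct rq *rq, struct task_struct *p)
  {
          /* everything needed to make @p the running RT task:
           * exec_start, pushable-task bookkeeping, PELT update, push work */
  }

  static struct task_struct *
  pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
          struct task_struct *p = _pick_next_task_rt(rq);

          set_next_task_rt(rq, p);        /* the pick path reuses the helper ... */
          return p;
  }

  static void set_curr_task_rt(struct rq *rq)
  {
          set_next_task_rt(rq, rq->curr); /* ... and so does set_curr_task() */
  }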
Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/38c61d5240553e043c27c5e00b9dd0d184dd6081.1559129225.git.vpillai@digitalocean.com --- kernel/sched/deadline.c | 22 +++++++++++----------- kernel/sched/rt.c | 26 +++++++++++++------------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 039dde2b1dac..2dc2784b196c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1727,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) } #endif -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static void set_next_task_dl(struct rq *rq, struct task_struct *p) { p->se.exec_start = rq_clock_task(rq); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); + + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, p); + + if (rq->curr->sched_class != &dl_sched_class) + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + deadline_queue_push_tasks(rq); } static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, @@ -1791,15 +1799,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = dl_task_of(dl_se); - set_next_task(rq, p); - - if (hrtick_enabled(rq)) - start_hrtick_dl(rq, p); - - deadline_queue_push_tasks(rq); - - if (rq->curr->sched_class != &dl_sched_class) - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + set_next_task_dl(rq, p); return p; } @@ -1846,7 +1846,7 @@ static void task_fork_dl(struct task_struct *p) static void set_curr_task_dl(struct rq *rq) { - set_next_task(rq, rq->curr); + set_next_task_dl(rq, rq->curr); } #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a532558a5176..40bb71004325 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1498,12 +1498,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p) { p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); + + /* + * If prev task was rt, put_prev_task() has already updated the + * utilization. We only care of the case where we start to schedule a + * rt task + */ + if (rq->curr->sched_class != &rt_sched_class) + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + rt_queue_push_tasks(rq); } static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, @@ -1577,17 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = _pick_next_task_rt(rq); - set_next_task(rq, p); - - rt_queue_push_tasks(rq); - - /* - * If prev task was rt, put_prev_task() has already updated the - * utilization. 
We only care of the case where we start to schedule a - * rt task */ - if (rq->curr->sched_class != &rt_sched_class) - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + set_next_task_rt(rq, p); return p; } @@ -2356,7 +2356,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) static void set_curr_task_rt(struct rq *rq) { - set_next_task(rq, rq->curr); + set_next_task_rt(rq, rq->curr); } static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) -- cgit v1.2.3 From 10e7071b2f491b0fb981717ea0a585c441906ede Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Aug 2019 15:13:17 +0200 Subject: sched: Rework CPU hotplug task selection The CPU hotplug task selection is the only place where we used put_prev_task() on a task that is not current. While looking at that, it occurred to me that we can simplify all that by using a custom pick loop. Since we don't need to put current, we can do away with the fake task too. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan --- kernel/sched/core.c | 32 ++++++++++++++------------------ kernel/sched/sched.h | 1 + 2 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a821ff68502..364b6d7da2be 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6082,21 +6082,22 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +static struct task_struct *__pick_migrate_task(struct rq *rq) { -} + const struct sched_class *class; + struct task_struct *next; -static const struct sched_class fake_sched_class = { - .put_prev_task = put_prev_task_fake, -}; + for_each_class(class) { + next = class->pick_next_task(rq, NULL, NULL); + if (next) { + next->sched_class->put_prev_task(rq, next); + return next; + } + } -static struct task_struct fake_task = { - /* - * Avoid pull_{rt,dl}_task() - */ - .prio = MAX_PRIO + 1, - .sched_class = &fake_sched_class, -}; + /* The idle class should always have a runnable task */ + BUG(); +} /* * Migrate all tasks from the rq, sleeping tasks will be migrated by @@ -6139,12 +6140,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) if (rq->nr_running == 1) break; - /* - * pick_next_task() assumes pinned rq->lock: - */ - next = pick_next_task(rq, &fake_task, rf); - BUG_ON(!next); - put_prev_task(rq, next); + next = __pick_migrate_task(rq); /* * Rules for changing task_struct::cpus_mask are holding diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ea48aa5daeee..b3449d0dd7f0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1751,6 +1751,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { + WARN_ON_ONCE(rq->curr != prev); prev->sched_class->put_prev_task(rq, prev); } -- cgit v1.2.3 From 03b7fad167efca3b7abbbb39733933f9df56e79c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:41 +0000 Subject: sched: Add task_struct pointer to sched_class::set_curr_task In preparation for further separating pick_next_task() and set_curr_task() we have to pass the actual task into it; while there, rename the thing to better pair with put_prev_task().
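At this point in the series the two sched.h wrappers pair up symmetrically; the following is merely a condensation of the hunks in this and the previous patch, with nothing new added:

  static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
  {
          WARN_ON_ONCE(rq->curr != prev);
          prev->sched_class->put_prev_task(rq, prev);
  }

  static inline void set_next_task(struct rq *rq, struct task_struct *next)
  {
          WARN_ON_ONCE(rq->curr != next);
          next->sched_class->set_next_task(rq, next);
  }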
Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/a96d1bcdd716db4a4c5da2fece647a1456c0ed78.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 12 ++++++------ kernel/sched/deadline.c | 7 +------ kernel/sched/fair.c | 17 ++++++++++++++--- kernel/sched/idle.c | 27 +++++++++++++++------------ kernel/sched/rt.c | 7 +------ kernel/sched/sched.h | 7 ++++--- kernel/sched/stop_task.c | 17 +++++++---------- 7 files changed, 48 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 364b6d7da2be..0c4220789092 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1494,7 +1494,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); } /* @@ -4325,7 +4325,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (queued) enqueue_task(rq, p, queue_flag); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4392,7 +4392,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); out_unlock: task_rq_unlock(rq, p, &rf); } @@ -4840,7 +4840,7 @@ change: enqueue_task(rq, p, queue_flags); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); @@ -6042,7 +6042,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -6919,7 +6919,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); if (running) - set_curr_task(rq, tsk); + set_next_task(rq, tsk); task_rq_unlock(rq, tsk, &rf); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2dc2784b196c..6eae79350303 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1844,11 +1844,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void set_curr_task_dl(struct rq *rq) -{ - set_next_task_dl(rq, rq->curr); -} - #ifdef CONFIG_SMP /* Only try algorithms three times */ @@ -2466,6 +2461,7 @@ const struct sched_class dl_sched_class = { .pick_next_task = pick_next_task_dl, .put_prev_task = put_prev_task_dl, + .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, @@ -2476,7 +2472,6 @@ const struct sched_class dl_sched_class = { .task_woken = task_woken_dl, #endif - .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d8043fc8317..8ce1b8893947 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10150,9 +10150,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. 
*/ -static void set_curr_task_fair(struct rq *rq) +static void set_next_task_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &rq->curr->se; + struct sched_entity *se = &p->se; + +#ifdef CONFIG_SMP + if (task_on_rq_queued(p)) { + /* + * Move the next running task to the front of the list, so our + * cfs_tasks list becomes MRU one. + */ + list_move(&se->group_node, &rq->cfs_tasks); + } +#endif for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10423,7 +10433,9 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, @@ -10436,7 +10448,6 @@ const struct sched_class fair_sched_class = { .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_fork = task_fork_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80940939b733..54194d41035c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -374,14 +374,25 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +static void set_next_task_idle(struct rq *rq, struct task_struct *next) +{ + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); +} + static struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { + struct task_struct *next = rq->idle; + put_prev_task(rq, prev); - update_idle_core(rq); - schedstat_inc(rq->sched_goidle); + set_next_task_idle(rq, next); - return rq->idle; + return next; } /* @@ -397,10 +408,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) raw_spin_lock_irq(&rq->lock); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ -} - /* * scheduler tick hitting a task of our scheduling class. 
* @@ -413,10 +420,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } -static void set_curr_task_idle(struct rq *rq) -{ -} - static void switched_to_idle(struct rq *rq, struct task_struct *p) { BUG(); @@ -451,13 +454,13 @@ const struct sched_class idle_sched_class = { .pick_next_task = pick_next_task_idle, .put_prev_task = put_prev_task_idle, + .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, .get_rr_interval = get_rr_interval_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 40bb71004325..f71bcbe1a00c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2354,11 +2354,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) } } -static void set_curr_task_rt(struct rq *rq) -{ - set_next_task_rt(rq, rq->curr); -} - static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* @@ -2380,6 +2375,7 @@ const struct sched_class rt_sched_class = { .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, + .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, @@ -2391,7 +2387,6 @@ const struct sched_class rt_sched_class = { .switched_from = switched_from_rt, #endif - .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, .get_rr_interval = get_rr_interval_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3449d0dd7f0..f3c50445bf22 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1707,6 +1707,7 @@ struct sched_class { struct task_struct *prev, struct rq_flags *rf); void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); @@ -1721,7 +1722,6 @@ struct sched_class { void (*rq_offline)(struct rq *rq); #endif - void (*set_curr_task)(struct rq *rq); void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); void (*task_dead)(struct task_struct *p); @@ -1755,9 +1755,10 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) prev->sched_class->put_prev_task(rq, prev); } -static inline void set_curr_task(struct rq *rq, struct task_struct *curr) +static inline void set_next_task(struct rq *rq, struct task_struct *next) { - curr->sched_class->set_curr_task(rq); + WARN_ON_ONCE(rq->curr != next); + next->sched_class->set_next_task(rq, next); } #ifdef CONFIG_SMP diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index c183b790ca54..47a3d2a18a9a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,6 +23,11 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } +static void set_next_task_stop(struct rq *rq, struct task_struct *stop) +{ + stop->se.exec_start = rq_clock_task(rq); +} + static struct task_struct * pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -32,8 +37,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf return NULL; put_prev_task(rq, prev); - - stop->se.exec_start = rq_clock_task(rq); + set_next_task_stop(rq, stop); return stop; } @@ -86,13 +90,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void 
set_curr_task_stop(struct rq *rq) -{ - struct task_struct *stop = rq->stop; - - stop->se.exec_start = rq_clock_task(rq); -} - static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ @@ -128,13 +125,13 @@ const struct sched_class stop_sched_class = { .pick_next_task = pick_next_task_stop, .put_prev_task = put_prev_task_stop, + .set_next_task = set_next_task_stop, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_stop, .task_tick = task_tick_stop, .get_rr_interval = get_rr_interval_stop, -- cgit v1.2.3 From 5ba553eff0c3a7c099b1e29a740277a82c0c3314 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:42 +0000 Subject: sched/fair: Expose newidle_balance() For pick_next_task_fair() it is the newidle balance that requires dropping the rq->lock; provided we do put_prev_task() early, we can also detect the condition for doing newidle early. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/9e3eb1859b946f03d7e500453a885725b68957ba.1559129225.git.vpillai@digitalocean.com --- kernel/sched/fair.c | 18 ++++++++---------- kernel/sched/sched.h | 4 ++++ 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ce1b8893947..e7c27eda9f24 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3690,8 +3690,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq, struct rq_flags *rf); - static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -6878,11 +6876,10 @@ done: __maybe_unused; return p; idle: - update_misfit_status(NULL, rq); - new_tasks = idle_balance(rq, rf); + new_tasks = newidle_balance(rq, rf); /* - * Because idle_balance() releases (and re-acquires) rq->lock, it is + * Because newidle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -9045,10 +9042,10 @@ out_one_pinned: ld_moved = 0; /* - * idle_balance() disregards balance intervals, so we could repeatedly - * reach this code, which would lead to balance_interval skyrocketting - * in a short amount of time. Skip the balance_interval increase logic - * to avoid that. + * newidle_balance() disregards balance intervals, so we could + * repeatedly reach this code, which would lead to balance_interval + * skyrocketting in a short amount of time. Skip the balance_interval + * increase logic to avoid that. */ if (env.idle == CPU_NEWLY_IDLE) goto out; @@ -9758,7 +9755,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. 
*/ -static int idle_balance(struct rq *this_rq, struct rq_flags *rf) +int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -9766,6 +9763,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; + update_misfit_status(NULL, this_rq); /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f3c50445bf22..304d98e712bf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1445,10 +1445,14 @@ static inline void unregister_sched_domain_sysctl(void) } #endif +extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + #else static inline void sched_ttwu_pending(void) { } +static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } + #endif /* CONFIG_SMP */ #include "stats.h" -- cgit v1.2.3 From 5f2a45fc9e89e022233085e6f0f352eb6ff770bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:43 +0000 Subject: sched: Allow put_prev_task() to drop rq->lock Currently the pick_next_task() loop is convoluted and ugly because of how it can drop the rq->lock and needs to restart the picking. For the RT/Deadline classes, it is put_prev_task() where we do balancing, and we could do this before the picking loop. Make this possible. Signed-off-by: Peter Zijlstra (Intel) Cc: Valentin Schneider Cc: Aaron Lu Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/e4519f6850477ab7f3d257062796e6425ee4ba7c.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 14 +++++++++++++- kernel/sched/fair.c | 2 +- kernel/sched/idle.c | 2 +- kernel/sched/rt.c | 14 +++++++++++++- kernel/sched/sched.h | 4 ++-- kernel/sched/stop_task.c | 2 +- 7 files changed, 32 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c4220789092..7bbe78a31ba5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6090,7 +6090,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) for_each_class(class) { next = class->pick_next_task(rq, NULL, NULL); if (next) { - next->sched_class->put_prev_task(rq, next); + next->sched_class->put_prev_task(rq, next, NULL); return next; } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6eae79350303..2872e15a87cd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1804,13 +1804,25 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); + + if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. 
+ */ + rq_unpin_lock(rq, rf); + pull_dl_task(rq); + rq_repin_lock(rq, rf); + } } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e7c27eda9f24..4418c1998e69 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6901,7 +6901,7 @@ idle: /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 54194d41035c..8d59de2e4a6e 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -374,7 +374,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f71bcbe1a00c..dbdabd76f192 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1592,7 +1592,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { update_curr_rt(rq); @@ -1604,6 +1604,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) */ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); + } } #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 304d98e712bf..e085cffb8004 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1710,7 +1710,7 @@ struct sched_class { struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf); void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP @@ -1756,7 +1756,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev); + prev->sched_class->put_prev_task(rq, prev, NULL); } static inline void set_next_task(struct rq *rq, struct task_struct *next) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47a3d2a18a9a..8f414018d5e0 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -59,7 +59,7 @@ static void yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. 
*/ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *curr = rq->curr; u64 delta_exec; -- cgit v1.2.3 From 67692435c411e5c53a1c588ecca2037aebd81f2e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:44 +0000 Subject: sched: Rework pick_next_task() slow-path Avoid the RETRY_TASK case in the pick_next_task() slow path. By doing the put_prev_task() early, we get the rt/deadline pull done, and by testing rq->nr_running we know if we need newidle_balance(). This then gives a stable state to pick a task from. Since the fast-path is fair only; it means the other classes will always have pick_next_task(.prev=NULL, .rf=NULL) and we can simplify. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/aa34d24b36547139248f32a30138791ac6c02bd6.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 19 ++++++++++++------- kernel/sched/deadline.c | 30 ++---------------------------- kernel/sched/fair.c | 9 ++++++--- kernel/sched/idle.c | 4 +++- kernel/sched/rt.c | 29 +---------------------------- kernel/sched/sched.h | 13 ++++++++----- kernel/sched/stop_task.c | 3 ++- 7 files changed, 34 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7bbe78a31ba5..a6661852907b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3791,7 +3791,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) - goto again; + goto restart; /* Assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) @@ -3800,14 +3800,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -again: +restart: + /* + * Ensure that we put DL/RT tasks before the pick loop, such that they + * can PULL higher prio tasks when we lower the RQ 'priority'. + */ + prev->sched_class->put_prev_task(rq, prev, rf); + if (!rq->nr_running) + newidle_balance(rq, rf); + for_each_class(class) { - p = class->pick_next_task(rq, prev, rf); - if (p) { - if (unlikely(p == RETRY_TASK)) - goto again; + p = class->pick_next_task(rq, NULL, NULL); + if (p) return p; - } } /* The idle class should always have a runnable task: */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2872e15a87cd..0b9cbfb2b1d4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1761,39 +1761,13 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *p; struct dl_rq *dl_rq; - dl_rq = &rq->dl; - - if (need_pull_dl_task(rq, prev)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we're - * being very careful to re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_dl_task(rq); - rq_repin_lock(rq, rf); - /* - * pull_dl_task() can drop (and re-acquire) rq->lock; this - * means a stop task can slip in, in which case we need to - * re-start task selection. - */ - if (rq->stop && task_on_rq_queued(rq->stop)) - return RETRY_TASK; - } + WARN_ON_ONCE(prev || rf); - /* - * When prev is DL, we may throttle it in put_prev_task(). 
- * So, we update time before we check for dl_nr_running. - */ - if (prev->sched_class == &dl_sched_class) - update_curr_dl(rq); + dl_rq = &rq->dl; if (unlikely(!dl_rq->dl_nr_running)) return NULL; - put_prev_task(rq, prev); - dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4418c1998e69..19c58599e967 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6770,7 +6770,7 @@ again: goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != &fair_sched_class) + if (!prev || prev->sched_class != &fair_sched_class) goto simple; /* @@ -6847,8 +6847,8 @@ again: goto done; simple: #endif - - put_prev_task(rq, prev); + if (prev) + put_prev_task(rq, prev); do { se = pick_next_entity(cfs_rq, NULL); @@ -6876,6 +6876,9 @@ done: __maybe_unused; return p; idle: + if (!rf) + return NULL; + new_tasks = newidle_balance(rq, rf); /* diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8d59de2e4a6e..7c54550dda6a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -389,7 +389,9 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf { struct task_struct *next = rq->idle; - put_prev_task(rq, prev); + if (prev) + put_prev_task(rq, prev); + set_next_task_idle(rq, next); return next; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index dbdabd76f192..858c4cc6f99b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1553,38 +1553,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; - if (need_pull_rt_task(rq, prev)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we're - * being very careful to re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_rt_task(rq); - rq_repin_lock(rq, rf); - /* - * pull_rt_task() can drop (and re-acquire) rq->lock; this - * means a dl or stop task can slip in, in which case we need - * to re-start task selection. - */ - if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || - rq->dl.dl_nr_running)) - return RETRY_TASK; - } - - /* - * We may dequeue prev's rt_rq in put_prev_task(). - * So, we update time before rt_queued check. - */ - if (prev->sched_class == &rt_sched_class) - update_curr_rt(rq); + WARN_ON_ONCE(prev || rf); if (!rt_rq->rt_queued) return NULL; - put_prev_task(rq, prev); - p = _pick_next_task_rt(rq); set_next_task_rt(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e085cffb8004..7111e3a1eeb4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1700,12 +1700,15 @@ struct sched_class { void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); /* - * It is the responsibility of the pick_next_task() method that will - * return the next task to call put_prev_task() on the @prev task or - * something equivalent. + * Both @prev and @rf are optional and may be NULL, in which case the + * caller must already have invoked put_prev_task(rq, prev, rf). * - * May return RETRY_TASK when it finds a higher prio class has runnable - * tasks. + * Otherwise it is the responsibility of the pick_next_task() to call + * put_prev_task() on the @prev task or something equivalent, IFF it + * returns a next task. + * + * In that case (@rf != NULL) it may return RETRY_TASK when it finds a + * higher prio class has runnable tasks. 
*/ struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 8f414018d5e0..7e1cee4e65b2 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -33,10 +33,11 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf { struct task_struct *stop = rq->stop; + WARN_ON_ONCE(prev || rf); + if (!stop || !task_on_rq_queued(stop)) return NULL; - put_prev_task(rq, prev); set_next_task_stop(rq, stop); return stop; -- cgit v1.2.3 From 5c3ceef9ad7b340b0acee6c26d0c9e6429decb2c Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Fri, 2 Aug 2019 11:46:28 +0100 Subject: cpufreq: schedutil: fix equation in comment scale_irq_capacity() call in schedutil_cpu_util() does util *= (max - irq) util /= max But the comment says util *= (1 - irq) util /= max Fix the comment to match what the scaling function does. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Acked-by: Vincent Guittot Cc: Ingo Molnar Cc: "Rafael J . Wysocki" Link: https://lkml.kernel.org/r/20190802104628.8410-1-qais.yousef@arm.com --- kernel/sched/cpufreq_schedutil.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 636ca6f88c8e..e127d89d5974 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -259,9 +259,9 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, * irq metric. Because IRQ/steal time is hidden from the task clock we * need to scale the task numbers: * - * 1 - irq - * U' = irq + ------- * U - * max + * max - irq + * U' = irq + --------- * U + * max */ util = scale_irq_capacity(util, irq, max); util += irq; -- cgit v1.2.3 From ac9eafbe930abb589e9289842a99cc575cadb854 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 1 Aug 2019 19:31:10 +0200 Subject: ACPI: PM: s2idle: Execute LPS0 _DSM functions with suspended devices According to Section 3.5 of the "Intel Low Power S0 Idle" document [1], Function 5 of the LPS0 _DSM is expected to be invoked when the system configuration matches the criteria for entering the target low-power state of the platform. In particular, this means that all devices should be suspended and in low-power states already when that function is invoked. This is not the case currently, however, because Function 5 of the LPS0 _DSM is invoked by it before the "noirq" phase of device suspend, which means that some devices may not have been put into low-power states yet at that point. That is a consequence of the previous design of the suspend-to-idle flow that allowed the "noirq" phase of device suspend and the "noirq" phase of device resume to be carried out for multiple times while "suspended" (if any spurious wakeup events were detected) and the point of the LPS0 _DSM Function 5 invocation was chosen so as to call it (and LPS0 _DSM Function 6 analogously) once per suspend-resume cycle (regardless of how many times the "noirq" phases of device suspend and resume were carried out while "suspended"). Now that the suspend-to-idle flow has been redesigned to carry out the "noirq" phases of device suspend and resume once in each cycle, the code can be reordered to follow the specification that it is based on more closely. 
For this purpose, add ->prepare_late and ->restore_early platform callbacks for suspend-to-idle, to be executed, respectively, after the "noirq" phase of suspending devices and before the "noirq" phase of resuming them and make ACPI use them for the invocation of LPS0 _DSM functions as appropriate. While at it, move the LPS0 entry requirements check to be made before invoking Functions 3 and 5 of the LPS0 _DSM (also once per cycle) as follows from the specification [1]. Link: https://uefi.org/sites/default/files/resources/Intel_ACPI_Low_Power_S0_Idle.pdf # [1] Signed-off-by: Rafael J. Wysocki Tested-by: Kai-Heng Feng --- kernel/power/suspend.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 2b6057853b33..ed9ddef12b13 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -253,13 +253,19 @@ static int platform_suspend_prepare_late(suspend_state_t state) static int platform_suspend_prepare_noirq(suspend_state_t state) { - return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare_late ? - suspend_ops->prepare_late() : 0; + if (state == PM_SUSPEND_TO_IDLE) { + if (s2idle_ops && s2idle_ops->prepare_late) + return s2idle_ops->prepare_late(); + } + return suspend_ops->prepare_late ? suspend_ops->prepare_late() : 0; } static void platform_resume_noirq(suspend_state_t state) { - if (state != PM_SUSPEND_TO_IDLE && suspend_ops->wake) + if (state == PM_SUSPEND_TO_IDLE) { + if (s2idle_ops && s2idle_ops->restore_early) + s2idle_ops->restore_early(); + } else if (suspend_ops->wake) suspend_ops->wake(); } -- cgit v1.2.3 From ec9c7d19336ee98ecba8de80128aa405c45feebb Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 8 Aug 2019 12:05:35 -0400 Subject: padata: initialize pd->cpu with effective cpumask Exercising CPU hotplug on a 5.2 kernel with recent padata fixes from cryptodev-2.6.git in an 8-CPU kvm guest... # modprobe tcrypt alg="pcrypt(rfc4106(gcm(aes)))" type=3 # echo 0 > /sys/devices/system/cpu/cpu1/online # echo c > /sys/kernel/pcrypt/pencrypt/parallel_cpumask # modprobe tcrypt mode=215 ...caused the following crash: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 2 PID: 134 Comm: kworker/2:2 Not tainted 5.2.0-padata-base+ #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0- Workqueue: pencrypt padata_parallel_worker RIP: 0010:padata_reorder+0xcb/0x180 ... Call Trace: padata_do_serial+0x57/0x60 pcrypt_aead_enc+0x3a/0x50 [pcrypt] padata_parallel_worker+0x9b/0xe0 process_one_work+0x1b5/0x3f0 worker_thread+0x4a/0x3c0 ... In padata_alloc_pd, pd->cpu is set using the user-supplied cpumask instead of the effective cpumask, and in this case cpumask_first picked an offline CPU. The offline CPU's reorder->list.next is NULL in padata_reorder because the list wasn't initialized in padata_init_pqueues, which only operates on CPUs in the effective mask. Fix by using the effective mask in padata_alloc_pd. 
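As a stand-alone illustration (not kernel code, with made-up masks): the first set bit of a user-supplied cpumask may well be an offline CPU, which is why the effective (online-restricted) mask has to be used.

  #include <stdio.h>

  int main(void)
  {
          unsigned long user_mask   = 0x6;    /* user asked for CPUs 1 and 2 */
          unsigned long online_mask = 0x5;    /* but CPU 1 has been offlined */
          unsigned long effective   = user_mask & online_mask;

          /* __builtin_ctzl() plays the role of cpumask_first() here */
          printf("first CPU of user mask:      %d (offline!)\n", __builtin_ctzl(user_mask));
          printf("first CPU of effective mask: %d\n", __builtin_ctzl(effective));
          return 0;
  }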
Fixes: 6fc4dbcf0276 ("padata: Replace delayed timer with immediate workqueue in padata_reorder") Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Steffen Klassert Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 7372fb45eeeb..b60cc3dcee58 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -426,7 +426,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, atomic_set(&pd->refcnt, 0); pd->pinst = pinst; spin_lock_init(&pd->lock); - pd->cpu = cpumask_first(pcpumask); + pd->cpu = cpumask_first(pd->cpumask.pcpu); INIT_WORK(&pd->reorder_work, invoke_padata_reorder); return pd; -- cgit v1.2.3 From 47e5d8f9ed34a5310f45f48bbc15f1e61d83b6db Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 6 Aug 2019 01:49:15 -0300 Subject: swiotlb: Remove call to sme_active() sme_active() is an x86-specific function so it's better not to call it from generic code. There's no need to mention which memory encryption feature is active, so just use a more generic message. Besides, other architectures will have different names for similar technology. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Christoph Hellwig Reviewed-by: Tom Lendacky Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190806044919.10622-3-bauerman@linux.ibm.com --- kernel/dma/swiotlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9de232229063..f29caad71e13 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -461,8 +461,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); if (mem_encrypt_active()) - pr_warn_once("%s is active and system is using DMA bounce buffers\n", - sme_active() ? "SME" : "SEV"); + pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); mask = dma_get_seg_boundary(hwdev); -- cgit v1.2.3 From e740815a97e2b6d6446792f4328378e66de166d1 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 6 Aug 2019 01:49:16 -0300 Subject: dma-mapping: Remove dma_check_mask() sme_active() is an x86-specific function so it's better not to call it from generic code. Christoph Hellwig mentioned that "There is no reason why we should have a special debug printk just for one specific reason why there is a requirement for a large DMA mask.", so just remove dma_check_mask(). 
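For context, a driver states its own requirement for a large DMA mask along these lines; this is a generic sketch rather than anything from this patch, and the function name is hypothetical:

  #include <linux/dma-mapping.h>

  /* Prefer 64-bit DMA addressing; fall back to 32-bit if unsupported. */
  static int example_probe_dma(struct device *dev)
  {
          if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)) &&
              dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)))
                  return -EIO;    /* no usable DMA addressing */

          return 0;
  }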
Signed-off-by: Thiago Jung Bauermann Reviewed-by: Christoph Hellwig Reviewed-by: Tom Lendacky Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190806044919.10622-4-bauerman@linux.ibm.com --- kernel/dma/mapping.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 1f628e7ac709..61eeefbfcb36 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -291,12 +291,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); -static inline void dma_check_mask(struct device *dev, u64 mask) -{ - if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) - dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); -} - int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -327,7 +321,6 @@ int dma_set_mask(struct device *dev, u64 mask) return -EIO; arch_dma_set_mask(dev, mask); - dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } @@ -345,7 +338,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) if (!dma_supported(dev, mask)) return -EIO; - dma_check_mask(dev, mask); dev->coherent_dma_mask = mask; return 0; } -- cgit v1.2.3 From 28875945ba98d1b47a8a706812b6494d165bb0a0 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 18:12:22 -0400 Subject: rcu: Add support for consolidated-RCU reader checking This commit adds RCU-reader checks to list_for_each_entry_rcu() and hlist_for_each_entry_rcu(). These checks are optional, and are indicated by a lockdep expression passed to a new optional argument to these two macros. If this optional lockdep expression is omitted, these two macros act as before, checking for an RCU read-side critical section. Signed-off-by: Joel Fernandes (Google) [ paulmck: Update to eliminate return within macro and update comment. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 11 ++++++ kernel/rcu/update.c | 96 +++++++++++++++++++++++++++++++----------------- 2 files changed, 74 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 5ec3ea4028e2..4aa02eee8f6c 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -8,6 +8,17 @@ menu "RCU Debugging" config PROVE_RCU def_bool PROVE_LOCKING +config PROVE_RCU_LIST + bool "RCU list lockdep debugging" + depends on PROVE_RCU && RCU_EXPERT + default n + help + Enable RCU lockdep checking for list usages. By default it is + turned off since there are several list RCU users that still + need to be converted to pass a lockdep expression. To prevent + false-positive splats, we keep it default disabled but once all + users are converted, we can remove this config option. + config TORTURE_TEST tristate default n diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 9dd5aeef6e70..38cbd616b381 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -61,9 +61,15 @@ module_param(rcu_normal_after_boot, int, 0); #ifdef CONFIG_DEBUG_LOCK_ALLOC /** - * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? + * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? + * @ret: Best guess answer if lockdep cannot be relied on * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * Returns true if lockdep must be ignored, in which case *ret contains + * the best guess described below. 
Otherwise returns false, in which + * case *ret tells the caller nothing and the caller should instead + * consult lockdep. + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling @@ -75,30 +81,44 @@ module_param(rcu_normal_after_boot, int, 0); * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. * - * Note that if the CPU is in the idle loop from an RCU point of - * view (ie: that we are in the section between rcu_idle_enter() and - * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU - * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs - * that are in such a section, considering these as in extended quiescent - * state, so such a CPU is effectively never in an RCU read-side critical - * section regardless of what RCU primitives it invokes. This state of - * affairs is required --- we need to keep an RCU-free window in idle - * where the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run in - * the idle task. + * Note that if the CPU is in the idle loop from an RCU point of view (ie: + * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) + * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are + * in such a section, considering these as in extended quiescent state, + * so such a CPU is effectively never in an RCU read-side critical section + * regardless of what RCU primitives it invokes. This state of affairs is + * required --- we need to keep an RCU-free window in idle where the CPU may + * possibly enter into low power mode. This way we can notice an extended + * quiescent state to other CPUs that started a grace period. Otherwise + * we would delay any grace period as long as we run in the idle task. * - * Similarly, we avoid claiming an SRCU read lock held if the current + * Similarly, we avoid claiming an RCU read lock held if the current * CPU is offline. 
*/ +static bool rcu_read_lock_held_common(bool *ret) +{ + if (!debug_lockdep_rcu_enabled()) { + *ret = 1; + return true; + } + if (!rcu_is_watching()) { + *ret = 0; + return true; + } + if (!rcu_lockdep_current_cpu_online()) { + *ret = 0; + return true; + } + return false; +} + int rcu_read_lock_sched_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return lock_is_held(&rcu_sched_lock_map) || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); @@ -257,12 +277,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); */ int rcu_read_lock_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return lock_is_held(&rcu_lock_map); } EXPORT_SYMBOL_GPL(rcu_read_lock_held); @@ -284,16 +302,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); */ int rcu_read_lock_bh_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); +int rcu_read_lock_any_held(void) +{ + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; + if (lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_sched_lock_map)) + return 1; + return !preemptible(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_any_held); + #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** -- cgit v1.2.3 From 11f26633cccb7243217370837cbb066a73f678a5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 10 Aug 2019 13:18:06 +0200 Subject: PM: suspend: Fix platform_suspend_prepare_noirq() After commit ac9eafbe930a ("ACPI: PM: s2idle: Execute LPS0 _DSM functions with suspended devices"), a NULL pointer may be dereferenced if suspend-to-idle is attempted on a platform without "traditional" suspend support due to invalid fall-through in platform_suspend_prepare_noirq(). Fix that and while at it add missing braces in platform_resume_noirq(). Fixes: ac9eafbe930a ("ACPI: PM: s2idle: Execute LPS0 _DSM functions with suspended devices") Reported-by: Marek Szyprowski Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ed9ddef12b13..f3b7239f1892 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -253,10 +253,10 @@ static int platform_suspend_prepare_late(suspend_state_t state) static int platform_suspend_prepare_noirq(suspend_state_t state) { - if (state == PM_SUSPEND_TO_IDLE) { - if (s2idle_ops && s2idle_ops->prepare_late) - return s2idle_ops->prepare_late(); - } + if (state == PM_SUSPEND_TO_IDLE) + return s2idle_ops && s2idle_ops->prepare_late ? + s2idle_ops->prepare_late() : 0; + return suspend_ops->prepare_late ? 
suspend_ops->prepare_late() : 0; } @@ -265,8 +265,9 @@ static void platform_resume_noirq(suspend_state_t state) if (state == PM_SUSPEND_TO_IDLE) { if (s2idle_ops && s2idle_ops->restore_early) s2idle_ops->restore_early(); - } else if (suspend_ops->wake) + } else if (suspend_ops->wake) { suspend_ops->wake(); + } } static void platform_resume_early(suspend_state_t state) -- cgit v1.2.3 From 600f5badb78c316146d062cfd7af4a2cfb655baa Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 7 Aug 2019 12:36:01 +0530 Subject: cpufreq: schedutil: Don't skip freq update when limits change To avoid reducing the frequency of a CPU prematurely, we skip reducing the frequency if the CPU had been busy recently. This should not be done when the limits of the policy are changed, for example due to thermal throttling. We should always get the frequency within the new limits as soon as possible. Trying to fix this by using only one flag, i.e. need_freq_update, can lead to a race condition where the flag gets cleared without forcing us to change the frequency at least once. And so this patch introduces another flag to avoid that race condition. Fixes: ecd288429126 ("cpufreq: schedutil: Don't set next_freq to UINT_MAX") Cc: v4.18+ # v4.18+ Reported-by: Doug Smythies Tested-by: Doug Smythies Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 636ca6f88c8e..867b4bb6d4be 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -40,6 +40,7 @@ struct sugov_policy { struct task_struct *thread; bool work_in_progress; + bool limits_changed; bool need_freq_update; }; @@ -89,8 +90,11 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) !cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->need_freq_update)) + if (unlikely(sg_policy->limits_changed)) { + sg_policy->limits_changed = false; + sg_policy->need_freq_update = true; return true; + } delta_ns = time - sg_policy->last_freq_update_time; @@ -437,7 +441,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) - sg_policy->need_freq_update = true; + sg_policy->limits_changed = true; } static void sugov_update_single(struct update_util_data *hook, u64 time, @@ -457,7 +461,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - busy = sugov_cpu_is_busy(sg_cpu); + /* Limits may have changed, don't skip frequency update */ + busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu); util = sugov_get_util(sg_cpu); max = sg_cpu->max; @@ -831,6 +836,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->last_freq_update_time = 0; sg_policy->next_freq = 0; sg_policy->work_in_progress = false; + sg_policy->limits_changed = false; sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; @@ -879,7 +885,7 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->need_freq_update = true; + sg_policy->limits_changed = true; } struct cpufreq_governor schedutil_gov = { -- cgit v1.2.3 From cf14be0b41c659ede89abef3f7ec0e98e6cfea5b 
Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 14:33:23 +0300 Subject: dma-direct: fix DMA_ATTR_NO_KERNEL_MAPPING The new DMA_ATTR_NO_KERNEL_MAPPING needs to actually assign a dma_addr to work. Also skip it if the architecture needs forced decryption handling, as that needs a kernel virtual address. Fixes: d98849aff879 (dma-direct: handle DMA_ATTR_NO_KERNEL_MAPPING in common code) Signed-off-by: Christoph Hellwig Reviewed-by: Lucas Stach --- kernel/dma/direct.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 59bdceea3737..974e96a1de44 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -130,10 +130,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) { /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); + *dma_handle = phys_to_dma(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ return page; } @@ -178,7 +180,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && + !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ __dma_direct_free_pages(dev, size, cpu_addr); return; -- cgit v1.2.3 From d8ad55538abe443919e20e0bb996561bca9cad84 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 5 Aug 2019 17:51:53 +0200 Subject: dma-direct: don't truncate dma_required_mask to bus addressing capabilities The dma required_mask needs to reflect the actual addressing capabilities needed to handle the whole system RAM. When truncated down to the bus addressing capabilities dma_addressing_limited() will incorrectly signal no limitations for devices which are restricted by the bus_dma_mask. Fixes: b4ebe6063204 (dma-direct: implement complete bus_dma_mask handling) Signed-off-by: Lucas Stach Tested-by: Atish Patra Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 974e96a1de44..795c9b095d75 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -47,9 +47,6 @@ u64 dma_direct_get_required_mask(struct device *dev) { u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); - if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma) - max_dma = dev->bus_dma_mask; - return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -- cgit v1.2.3 From 33dcb37cef741294b481f4d889a465b8091f11bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Jul 2019 09:26:40 +0200 Subject: dma-mapping: fix page attributes for dma_mmap_* All the way back to introducing dma_common_mmap we've defaulted to mark the pages as uncached. But this is wrong for DMA coherent devices. Later on DMA_ATTR_WRITE_COMBINE also got incorrect treatment as that flag is only treated special on the alloc side for non-coherent devices. Introduce a new dma_pgprot helper that deals with the check for coherent devices so that only the remapping cases ever reach arch_dma_mmap_pgprot and we thus ensure no aliasing of page attributes happens, which makes the powerpc version of arch_dma_mmap_pgprot obsolete and simplifies the remaining ones. 
Note that this means arch_dma_mmap_pgprot is a bit misnamed now, but we'll phase it out soon. Fixes: 64ccc9c033c6 ("common: dma-mapping: add support for generic dma_mmap_* calls") Reported-by: Shawn Anastasio Reported-by: Gavin Li Signed-off-by: Christoph Hellwig Acked-by: Catalin Marinas # arm64 --- kernel/dma/mapping.c | 19 ++++++++++++++++++- kernel/dma/remap.c | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b945239621d8..b0038ca3aa92 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -150,6 +150,23 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, } EXPORT_SYMBOL(dma_get_sgtable_attrs); +#ifdef CONFIG_MMU +/* + * Return the page attributes used for mapping dma_alloc_* memory, either in + * kernel space if remapping is needed, or to userspace through dma_mmap_*. + */ +pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) +{ + if (dev_is_dma_coherent(dev) || + (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && + (attrs & DMA_ATTR_NON_CONSISTENT))) + return prot; + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT)) + return arch_dma_mmap_pgprot(dev, prot, attrs); + return pgprot_noncached(prot); +} +#endif /* CONFIG_MMU */ + /* * Create userspace mapping for the DMA-coherent memory. */ @@ -164,7 +181,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long pfn; int ret = -ENXIO; - vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs); + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index a594aec07882..ffe78f0b2fe4 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -218,7 +218,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, - arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), + dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { __dma_direct_free_pages(dev, size, page); -- cgit v1.2.3 From a46d14eca7b75fffe35603aa8b81df654353d80f Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Thu, 1 Aug 2019 09:37:49 -0400 Subject: sched/fair: Use rq_lock/unlock in online_fair_sched_group Enabling WARN_DOUBLE_CLOCK in /sys/kernel/debug/sched_features causes warning to fire in update_rq_clock. This seems to be caused by onlining a new fair sched group not using the rq lock wrappers. [] rq->clock_update_flags & RQCF_UPDATED [] WARNING: CPU: 5 PID: 54385 at kernel/sched/core.c:210 update_rq_clock+0xec/0x150 [] Call Trace: [] online_fair_sched_group+0x53/0x100 [] cpu_cgroup_css_online+0x16/0x20 [] online_css+0x1c/0x60 [] cgroup_apply_control_enable+0x231/0x3b0 [] cgroup_mkdir+0x41b/0x530 [] kernfs_iop_mkdir+0x61/0xa0 [] vfs_mkdir+0x108/0x1a0 [] do_mkdirat+0x77/0xe0 [] do_syscall_64+0x55/0x1d0 [] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Using the wrappers in online_fair_sched_group instead of the raw locking removes this warning. 
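For context, a minimal sketch (not part of the patch) of the locking pattern the rq wrappers provide; it assumes kernel/sched context, an illustrative CPU index i, and uses only helpers already named in this change (cpu_rq(), rq_lock_irq(), update_rq_clock(), rq_unlock_irq()):

	struct rq_flags rf;
	struct rq *rq = cpu_rq(i);

	rq_lock_irq(rq, &rf);	/* take rq->lock and pin the rq through rf */
	update_rq_clock(rq);	/* wrapper reset the clock-update state, so WARN_DOUBLE_CLOCK stays quiet */
	/* ... touch per-CPU scheduler state ... */
	rq_unlock_irq(rq, &rf);	/* unpin and release rq->lock */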
[ tglx: Use rq_*lock_irq() ] Signed-off-by: Phil Auld Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Cc: Vincent Guittot Cc: Ingo Molnar Link: https://lkml.kernel.org/r/20190801133749.11033-1-pauld@redhat.com --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 19c58599e967..1054d2cf6aaa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10281,18 +10281,18 @@ err: void online_fair_sched_group(struct task_group *tg) { struct sched_entity *se; + struct rq_flags rf; struct rq *rq; int i; for_each_possible_cpu(i) { rq = cpu_rq(i); se = tg->se[i]; - - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } } -- cgit v1.2.3 From 4189ff23489e6688ef4c7f5109df0cebbad425b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Aug 2019 08:55:24 +0200 Subject: kernel: only define task_struct_whitelist conditionally If CONFIG_ARCH_TASK_STRUCT_ALLOCATOR is set task_struct_whitelist is never called, and thus generates a compiler warning. Signed-off-by: Christoph Hellwig Link: https://lkml.kernel.org/r/20190812065524.19959-5-hch@lst.de Signed-off-by: Tony Luck --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2852d0e76ea3..f79e3da0caaf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -768,6 +768,7 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static void task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ @@ -782,6 +783,7 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size) else *offset += offsetof(struct task_struct, thread); } +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ void __init fork_init(void) { -- cgit v1.2.3 From e78a7614f3876ac649b3df608789cb6ef74d0480 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Jun 2019 07:46:43 -0700 Subject: idle: Prevent late-arriving interrupts from disrupting offline Scheduling-clock interrupts can arrive late in the CPU-offline process, after idle entry and the subsequent call to cpuhp_report_idle_dead(). Once execution passes the call to rcu_report_dead(), RCU is ignoring the CPU, which results in lockdep complaints when the interrupt handler uses RCU: ------------------------------------------------------------------------ ============================= WARNING: suspicious RCU usage 5.2.0-rc1+ #681 Not tainted ----------------------------- kernel/sched/fair.c:9542 suspicious rcu_dereference_check() usage! other info that might help us debug this: RCU used illegally from offline CPU! rcu_scheduler_active = 2, debug_locks = 1 no locks held by swapper/5/0. stack backtrace: CPU: 5 PID: 0 Comm: swapper/5 Not tainted 5.2.0-rc1+ #681 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x5e/0x8b trigger_load_balance+0xa8/0x390 ? tick_sched_do_timer+0x60/0x60 update_process_times+0x3b/0x50 tick_sched_handle+0x2f/0x40 tick_sched_timer+0x32/0x70 __hrtimer_run_queues+0xd3/0x3b0 hrtimer_interrupt+0x11d/0x270 ? 
sched_clock_local+0xc/0x74 smp_apic_timer_interrupt+0x79/0x200 apic_timer_interrupt+0xf/0x20 RIP: 0010:delay_tsc+0x22/0x50 Code: ff 0f 1f 80 00 00 00 00 65 44 8b 05 18 a7 11 48 0f ae e8 0f 31 48 89 d6 48 c1 e6 20 48 09 c6 eb 0e f3 90 65 8b 05 fe a6 11 48 <41> 39 c0 75 18 0f ae e8 0f 31 48 c1 e2 20 48 09 c2 48 89 d0 48 29 RSP: 0000:ffff8f92c0157ed0 EFLAGS: 00000212 ORIG_RAX: ffffffffffffff13 RAX: 0000000000000005 RBX: ffff8c861f356400 RCX: ffff8f92c0157e64 RDX: 000000321214c8cc RSI: 00000032120daa7f RDI: 0000000000260f15 RBP: 0000000000000005 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000000 R14: ffff8c861ee18000 R15: ffff8c861ee18000 cpuhp_report_idle_dead+0x31/0x60 do_idle+0x1d5/0x200 ? _raw_spin_unlock_irqrestore+0x2d/0x40 cpu_startup_entry+0x14/0x20 start_secondary+0x151/0x170 secondary_startup_64+0xa4/0xb0 ------------------------------------------------------------------------ This happens rarely, but can be forced to happen more often by placing delays in cpuhp_report_idle_dead() following the call to rcu_report_dead(). With this in place, the following rcutorture scenario reproduces the problem within a few minutes: tools/testing/selftests/rcutorture/bin/kvm.sh --cpus 8 --duration 5 --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" --configs "TREE04" This commit uses the crude but effective expedient of moving the disabling of interrupts within the idle loop to precede the cpu_is_offline() check. It also invokes tick_nohz_idle_stop_tick() instead of tick_nohz_idle_stop_tick_protected() to shut off the scheduling-clock interrupt. Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar [ paulmck: Revert tick_nohz_idle_stop_tick_protected() removal, new callers. ] Signed-off-by: Paul E. McKenney --- kernel/sched/idle.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80940939b733..e4bc4aa739b8 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -241,13 +241,14 @@ static void do_idle(void) check_pgt_cache(); rmb(); + local_irq_disable(); + if (cpu_is_offline(cpu)) { - tick_nohz_idle_stop_tick_protected(); + tick_nohz_idle_stop_tick(); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } - local_irq_disable(); arch_cpu_idle_enter(); /* -- cgit v1.2.3 From b823cafa7501f946a37dce5aa1e576a0b2f31ed9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Jul 2019 08:05:10 -0700 Subject: rcu: Remove redundant "if" condition from rcu_gp_is_expedited() Because rcu_expedited_nesting is initialized to 1 and not decremented until just before init is spawned, rcu_expedited_nesting is guaranteed to be non-zero whenever rcu_scheduler_active == RCU_SCHEDULER_INIT. This commit therefore removes this redundant "if" equality test. Signed-off-by: Paul E.
McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/update.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 249517058b13..64e9cc8609e7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -136,8 +136,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); */ bool rcu_gp_is_expedited(void) { - return rcu_expedited || atomic_read(&rcu_expedited_nesting) || - rcu_scheduler_active == RCU_SCHEDULER_INIT; + return rcu_expedited || atomic_read(&rcu_expedited_nesting); } EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); -- cgit v1.2.3 From 511b44f7598ce602f9efce687ca9eec013967d9b Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Mon, 29 Jul 2019 13:25:57 +0530 Subject: rcu: Fix spelling mistake "greate"->"great" This commit fixes a spelling mistake in file tree_exp.h. Signed-off-by: Mukesh Ojha Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 513b403b683b..d632cd019597 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -781,7 +781,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * other hand, if the CPU is not in an RCU read-side critical section, * the IPI handler reports the quiescent state immediately. * - * Although this is a greate improvement over previous expedited + * Although this is a great improvement over previous expedited * implementations, it is still unfriendly to real-time workloads, so is * thus not recommended for any sort of common-case code. In fact, if * you are using synchronize_rcu_expedited() in a loop, please restructure -- cgit v1.2.3 From 341dfcf8d78eaa3a2dc96dea06f0392eb2978364 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Aug 2019 11:39:47 -0700 Subject: btf: expose BTF info through sysfs Make .BTF section allocated and expose its contents through sysfs. The /sys/kernel/btf directory is created to contain all the BTFs present inside the kernel. Currently there is only the kernel's main BTF, represented as the /sys/kernel/btf/kernel file. Once kernel modules' BTFs are supported, each module will expose its BTF as a file under /sys/kernel/btf/. The current approach relies on a few pieces coming together: 1. pahole is used to take the almost-final vmlinux image (modulo .BTF and kallsyms) and generate a .BTF section by converting DWARF info into BTF. This section is not allocated and not mapped to any segment, though, so is not yet accessible from inside the kernel at runtime. 2. objcopy dumps the .BTF contents into a binary file and subsequently converts that binary file into a linkable object file with automatically generated symbols _binary__btf_kernel_bin_start and _binary__btf_kernel_bin_end, pointing to start and end, respectively, of the BTF raw data. 3. the final vmlinux image is generated by linking this object file (and kallsyms, if necessary). sysfs_btf.c then creates the /sys/kernel/btf/kernel file and exposes the embedded BTF contents through it. This allows, e.g., libbpf and bpftool to access BTF info at a well-known location, without resorting to searching for a vmlinux image on disk (the location of which is not standardized, and the vmlinux image might not even be available in some scenarios, e.g., inside qemu during testing). An alternative approach using the .incbin assembler directive to embed BTF contents directly was attempted but didn't work, because sysfs_btf.o is not re-compiled during the link-vmlinux.sh stage.
This is required, though, to update embedded BTF data (initially empty data is embedded, then pahole generates BTF info and we need to regenerate sysfs_btf.o with updated contents, but it's too late at that point). If BTF couldn't be generated due to missing or too old pahole, sysfs_btf.c handles that gracefully by detecting that _binary__btf_kernel_bin_start (weak symbol) is 0 and not creating /sys/kernel/btf at all. v2->v3: - added Documentation/ABI/testing/sysfs-kernel-btf (Greg K-H); - created proper kobject (btf_kobj) for btf directory (Greg K-H); - undo v2 change of reusing vmlinux, as it causes extra kallsyms pass due to initially missing __binary__btf_kernel_bin_{start/end} symbols; v1->v2: - allow kallsyms stage to re-use vmlinux generated by gen_btf(); Reviewed-by: Greg Kroah-Hartman Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/Makefile | 3 +++ kernel/bpf/sysfs_btf.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 kernel/bpf/sysfs_btf.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 29d781061cd5..e1d9adb212f9 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -22,3 +22,6 @@ obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o endif +ifeq ($(CONFIG_SYSFS),y) +obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o +endif diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c new file mode 100644 index 000000000000..092e63b9758b --- /dev/null +++ b/kernel/bpf/sysfs_btf.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel BTF information for introspection and use by eBPF tools. + */ +#include +#include +#include +#include +#include + +/* See scripts/link-vmlinux.sh, gen_btf() func for details */ +extern char __weak _binary__btf_kernel_bin_start[]; +extern char __weak _binary__btf_kernel_bin_end[]; + +static ssize_t +btf_kernel_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + memcpy(buf, _binary__btf_kernel_bin_start + off, len); + return len; +} + +static struct bin_attribute bin_attr_btf_kernel __ro_after_init = { + .attr = { .name = "kernel", .mode = 0444, }, + .read = btf_kernel_read, +}; + +static struct kobject *btf_kobj; + +static int __init btf_kernel_init(void) +{ + int err; + + if (!_binary__btf_kernel_bin_start) + return 0; + + btf_kobj = kobject_create_and_add("btf", kernel_kobj); + if (IS_ERR(btf_kobj)) { + err = PTR_ERR(btf_kobj); + btf_kobj = NULL; + return err; + } + + bin_attr_btf_kernel.size = _binary__btf_kernel_bin_end - + _binary__btf_kernel_bin_start; + + return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_kernel); +} + +subsys_initcall(btf_kernel_init); -- cgit v1.2.3 From 7fd785685e2243bb639b31557e258d11464c3489 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 13 Aug 2019 11:54:42 -0700 Subject: btf: rename /sys/kernel/btf/kernel into /sys/kernel/btf/vmlinux Expose kernel's BTF under the name vmlinux to be more uniform with using kernel module names as file names in the future. 
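As a usage illustration (not part of this patch), any userspace reader can pull the raw BTF blob from this well-known path; bpftool and libbpf are the intended consumers, but the hedged C sketch below shows that plain file I/O is enough (only the path comes from the patch, everything else is illustrative):

	#include <stdio.h>

	int main(void)
	{
		char buf[4096];
		size_t n, total = 0;
		FILE *f = fopen("/sys/kernel/btf/vmlinux", "rb");

		if (!f) {
			perror("fopen");
			return 1;
		}
		while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
			total += n;	/* accumulate the raw BTF bytes */
		printf("read %zu bytes of raw BTF\n", total);
		fclose(f);
		return 0;
	}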
Fixes: 341dfcf8d78e ("btf: expose BTF info through sysfs") Suggested-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/sysfs_btf.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 092e63b9758b..4659349fc795 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,30 +9,30 @@ #include /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak _binary__btf_kernel_bin_start[]; -extern char __weak _binary__btf_kernel_bin_end[]; +extern char __weak _binary__btf_vmlinux_bin_start[]; +extern char __weak _binary__btf_vmlinux_bin_end[]; static ssize_t -btf_kernel_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t len) +btf_vmlinux_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) { - memcpy(buf, _binary__btf_kernel_bin_start + off, len); + memcpy(buf, _binary__btf_vmlinux_bin_start + off, len); return len; } -static struct bin_attribute bin_attr_btf_kernel __ro_after_init = { - .attr = { .name = "kernel", .mode = 0444, }, - .read = btf_kernel_read, +static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { + .attr = { .name = "vmlinux", .mode = 0444, }, + .read = btf_vmlinux_read, }; static struct kobject *btf_kobj; -static int __init btf_kernel_init(void) +static int __init btf_vmlinux_init(void) { int err; - if (!_binary__btf_kernel_bin_start) + if (!_binary__btf_vmlinux_bin_start) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); @@ -42,10 +42,10 @@ static int __init btf_kernel_init(void) return err; } - bin_attr_btf_kernel.size = _binary__btf_kernel_bin_end - - _binary__btf_kernel_bin_start; + bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end - + _binary__btf_vmlinux_bin_start; - return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_kernel); + return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } -subsys_initcall(btf_kernel_init); +subsys_initcall(btf_vmlinux_init); -- cgit v1.2.3 From 58bf6f77c6fb0abe8e1330d8375dddd52711ef4c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Mar 2019 15:33:59 -0700 Subject: rcu/nocb: Rename rcu_data fields to prepare for forward-progress work This commit simply renames rcu_data fields to prepare for leader nocb kthreads doing only grace-period work and callback shuffling. This will mean the addition of replacement kthreads to invoke callbacks. The "leader" and "follower" thus become less meaningful, so the commit changes no-CB fields with these strings to "gp" and "cb", respectively. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 14 ++++----- kernel/rcu/tree_plugin.h | 78 ++++++++++++++++++++++++------------------------ 2 files changed, 46 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7acaf3a62d39..e4e59b627c5a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -198,10 +198,10 @@ struct rcu_data { struct rcu_head **nocb_tail; atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ - struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ - struct rcu_head **nocb_follower_tail; + struct rcu_head *nocb_cb_head; /* CBs ready to invoke. 
*/ + struct rcu_head **nocb_cb_tail; struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ - struct task_struct *nocb_kthread; + struct task_struct *nocb_cb_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ @@ -210,12 +210,12 @@ struct rcu_data { struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; /* CBs waiting for GP. */ struct rcu_head **nocb_gp_tail; - bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ - struct rcu_data *nocb_next_follower; - /* Next follower in wakeup chain. */ + bool nocb_gp_sleep; /* Is the nocb leader thread asleep? */ + struct rcu_data *nocb_next_cb_rdp; + /* Next rcu_data in wakeup chain. */ /* The following fields are used by the follower, hence new cachline. */ - struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; + struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 99e9d952827b..5ce1edd1c87f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1528,19 +1528,19 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, unsigned long flags) __releases(rdp->nocb_lock) { - struct rcu_data *rdp_leader = rdp->nocb_leader; + struct rcu_data *rdp_leader = rdp->nocb_gp_rdp; lockdep_assert_held(&rdp->nocb_lock); - if (!READ_ONCE(rdp_leader->nocb_kthread)) { + if (!READ_ONCE(rdp_leader->nocb_cb_kthread)) { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; } - if (rdp_leader->nocb_leader_sleep || force) { + if (rdp_leader->nocb_gp_sleep || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ - WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + WRITE_ONCE(rdp_leader->nocb_gp_sleep, false); del_timer(&rdp->nocb_timer); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ + smp_mb(); /* ->nocb_gp_sleep before swake_up_one(). */ swake_up_one(&rdp_leader->nocb_wq); } else { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); @@ -1604,10 +1604,10 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu) if (!rhp) rhp = READ_ONCE(rdp->nocb_gp_head); if (!rhp) - rhp = READ_ONCE(rdp->nocb_follower_head); + rhp = READ_ONCE(rdp->nocb_cb_head); /* Having no rcuo kthread but CBs after scheduler starts is bad! */ - if (!READ_ONCE(rdp->nocb_kthread) && rhp && + if (!READ_ONCE(rdp->nocb_cb_kthread) && rhp && rcu_scheduler_fully_active) { /* RCU callback enqueued before CPU first came online??? */ pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", @@ -1646,7 +1646,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ /* If we are not being polled and there is a kthread, awaken it ... 
*/ - t = READ_ONCE(rdp->nocb_kthread); + t = READ_ONCE(rdp->nocb_cb_kthread); if (rcu_nocb_poll || !t) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); @@ -1800,9 +1800,9 @@ wait_again: if (!rcu_nocb_poll) { trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); swait_event_interruptible_exclusive(my_rdp->nocb_wq, - !READ_ONCE(my_rdp->nocb_leader_sleep)); + !READ_ONCE(my_rdp->nocb_gp_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); - my_rdp->nocb_leader_sleep = true; + my_rdp->nocb_gp_sleep = true; WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); del_timer(&my_rdp->nocb_timer); raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); @@ -1818,7 +1818,7 @@ wait_again: */ gotcbs = false; smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) continue; /* No CBs here, try next follower. */ @@ -1845,12 +1845,12 @@ wait_again: rcu_nocb_wait_gp(my_rdp); /* Each pass through the following loop wakes a follower, if needed. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { if (!rcu_nocb_poll && READ_ONCE(rdp->nocb_head) && - READ_ONCE(my_rdp->nocb_leader_sleep)) { + READ_ONCE(my_rdp->nocb_gp_sleep)) { raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); - my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ + my_rdp->nocb_gp_sleep = false;/* No need to sleep.*/ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } if (!rdp->nocb_gp_head) @@ -1858,18 +1858,18 @@ wait_again: /* Append callbacks to follower's "done" list. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - tail = rdp->nocb_follower_tail; - rdp->nocb_follower_tail = rdp->nocb_gp_tail; + tail = rdp->nocb_cb_tail; + rdp->nocb_cb_tail = rdp->nocb_gp_tail; *tail = rdp->nocb_gp_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { + if (rdp != my_rdp && tail == &rdp->nocb_cb_head) { /* List was empty, so wake up the follower. */ swake_up_one(&rdp->nocb_wq); } } /* If we (the leader) don't have CBs, go wait some more. */ - if (!my_rdp->nocb_follower_head) + if (!my_rdp->nocb_cb_head) goto wait_again; } @@ -1882,8 +1882,8 @@ static void nocb_follower_wait(struct rcu_data *rdp) for (;;) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); swait_event_interruptible_exclusive(rdp->nocb_wq, - READ_ONCE(rdp->nocb_follower_head)); - if (smp_load_acquire(&rdp->nocb_follower_head)) { + READ_ONCE(rdp->nocb_cb_head)); + if (smp_load_acquire(&rdp->nocb_cb_head)) { /* ^^^ Ensure CB invocation follows _head test. */ return; } @@ -1910,17 +1910,17 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through this loop invokes one batch of callbacks */ for (;;) { /* Wait for callbacks. */ - if (rdp->nocb_leader == rdp) + if (rdp->nocb_gp_rdp == rdp) nocb_leader_wait(rdp); else nocb_follower_wait(rdp); /* Pull the ready-to-invoke callbacks onto local list. 
*/ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - list = rdp->nocb_follower_head; - rdp->nocb_follower_head = NULL; - tail = rdp->nocb_follower_tail; - rdp->nocb_follower_tail = &rdp->nocb_follower_head; + list = rdp->nocb_cb_head; + rdp->nocb_cb_head = NULL; + tail = rdp->nocb_cb_tail; + rdp->nocb_cb_tail = &rdp->nocb_cb_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (WARN_ON_ONCE(!list)) continue; @@ -2048,7 +2048,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; init_swait_queue_head(&rdp->nocb_wq); - rdp->nocb_follower_tail = &rdp->nocb_follower_head; + rdp->nocb_cb_tail = &rdp->nocb_cb_head; raw_spin_lock_init(&rdp->nocb_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); } @@ -2070,27 +2070,27 @@ static void rcu_spawn_one_nocb_kthread(int cpu) * If this isn't a no-CBs CPU or if it already has an rcuo kthread, * then nothing to do. */ - if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) + if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_cb_kthread) return; /* If we didn't spawn the leader first, reorganize! */ - rdp_old_leader = rdp_spawn->nocb_leader; - if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { + rdp_old_leader = rdp_spawn->nocb_gp_rdp; + if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_cb_kthread) { rdp_last = NULL; rdp = rdp_old_leader; do { - rdp->nocb_leader = rdp_spawn; + rdp->nocb_gp_rdp = rdp_spawn; if (rdp_last && rdp != rdp_spawn) - rdp_last->nocb_next_follower = rdp; + rdp_last->nocb_next_cb_rdp = rdp; if (rdp == rdp_spawn) { - rdp = rdp->nocb_next_follower; + rdp = rdp->nocb_next_cb_rdp; } else { rdp_last = rdp; - rdp = rdp->nocb_next_follower; - rdp_last->nocb_next_follower = NULL; + rdp = rdp->nocb_next_cb_rdp; + rdp_last->nocb_next_cb_rdp = NULL; } } while (rdp); - rdp_spawn->nocb_next_follower = rdp_old_leader; + rdp_spawn->nocb_next_cb_rdp = rdp_old_leader; } /* Spawn the kthread for this CPU. */ @@ -2098,7 +2098,7 @@ static void rcu_spawn_one_nocb_kthread(int cpu) "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) return; - WRITE_ONCE(rdp_spawn->nocb_kthread, t); + WRITE_ONCE(rdp_spawn->nocb_cb_kthread, t); } /* @@ -2158,12 +2158,12 @@ static void __init rcu_organize_nocb_kthreads(void) if (rdp->cpu >= nl) { /* New leader, set up for followers & next leader. */ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; - rdp->nocb_leader = rdp; + rdp->nocb_gp_rdp = rdp; rdp_leader = rdp; } else { /* Another follower, link to previous leader. */ - rdp->nocb_leader = rdp_leader; - rdp_prev->nocb_next_follower = rdp; + rdp->nocb_gp_rdp = rdp_leader; + rdp_prev->nocb_next_cb_rdp = rdp; } rdp_prev = rdp; } -- cgit v1.2.3 From 6484fe54b5c64e9a388f369001508ab8df85a646 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Mar 2019 15:44:18 -0700 Subject: rcu/nocb: Update comments to prepare for forward-progress work This commit simply rewords comments to prepare for leader nocb kthreads doing only grace-period work and callback shuffling. This will mean the addition of replacement kthreads to invoke callbacks. The "leader" and "follower" thus become less meaningful, so the commit changes no-CB comments with these strings to "GP" and "CB", respectively. (Give or take the usual grammatical transformations.) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.h | 8 +++---- kernel/rcu/tree_plugin.h | 57 ++++++++++++++++++++++++------------------------ 2 files changed, 33 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e4e59b627c5a..32b3348d3a4d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -206,17 +206,17 @@ struct rcu_data { int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ - /* The following fields are used by the leader, hence own cacheline. */ + /* The following fields are used by GP kthread, hence own cacheline. */ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; /* CBs waiting for GP. */ struct rcu_head **nocb_gp_tail; - bool nocb_gp_sleep; /* Is the nocb leader thread asleep? */ + bool nocb_gp_sleep; /* Is the nocb GP thread asleep? */ struct rcu_data *nocb_next_cb_rdp; /* Next rcu_data in wakeup chain. */ - /* The following fields are used by the follower, hence new cachline. */ + /* The following fields are used by CB kthread, hence new cachline. */ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; - /* Leader CPU takes GP-end wakeups. */ + /* GP rdp takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 6) RCU priority boosting. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5ce1edd1c87f..5a72700c3a32 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1447,10 +1447,10 @@ static void rcu_cleanup_after_idle(void) * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads * created that pull the callbacks from the corresponding CPU, wait for * a grace period to elapse, and invoke the callbacks. These kthreads - * are organized into leaders, which manage incoming callbacks, wait for - * grace periods, and awaken followers, and the followers, which only - * invoke callbacks. Each leader is its own follower. The no-CBs CPUs - * do a wake_up() on their kthread when they insert a callback into any + * are organized into GP kthreads, which manage incoming callbacks, wait for + * grace periods, and awaken CB kthreads, and the CB kthreads, which only + * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs + * do a wake_up() on their GP kthread when they insert a callback into any * empty list, unless the rcu_nocb_poll boot parameter has been specified, * in which case each kthread actively polls its CPU. (Which isn't so great * for energy efficiency, but which does reduce RCU's overhead on that CPU.) @@ -1521,7 +1521,7 @@ bool rcu_is_nocb_cpu(int cpu) } /* - * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock + * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock * and this function releases it. */ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, @@ -1548,7 +1548,7 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, } /* - * Kick the leader kthread for this NOCB group, but caller has not + * Kick the GP kthread for this NOCB group, but caller has not * acquired locks. */ static void wake_nocb_leader(struct rcu_data *rdp, bool force) @@ -1560,8 +1560,8 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) } /* - * Arrange to wake the leader kthread for this NOCB group at some - * future time when it is safe to do so. + * Arrange to wake the GP kthread for this NOCB group at some future + * time when it is safe to do so. 
*/ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, const char *reason) @@ -1783,7 +1783,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) } /* - * Leaders come here to wait for additional callbacks to show up. + * No-CBs GP kthreads come here to wait for additional callbacks to show up. * This function does not return until callbacks appear. */ static void nocb_leader_wait(struct rcu_data *my_rdp) @@ -1812,8 +1812,8 @@ wait_again: } /* - * Each pass through the following loop checks a follower for CBs. - * We are our own first follower. Any CBs found are moved to + * Each pass through the following loop checks for CBs. + * We are our own first CB kthread. Any CBs found are moved to * nocb_gp_head, where they await a grace period. */ gotcbs = false; @@ -1821,7 +1821,7 @@ wait_again: for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) - continue; /* No CBs here, try next follower. */ + continue; /* No CBs here, try next. */ /* Move callbacks to wait-for-GP list, which is empty. */ WRITE_ONCE(rdp->nocb_head, NULL); @@ -1844,7 +1844,7 @@ wait_again: /* Wait for one grace period. */ rcu_nocb_wait_gp(my_rdp); - /* Each pass through the following loop wakes a follower, if needed. */ + /* Each pass through this loop wakes a CB kthread, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { if (!rcu_nocb_poll && READ_ONCE(rdp->nocb_head) && @@ -1854,27 +1854,27 @@ wait_again: raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } if (!rdp->nocb_gp_head) - continue; /* No CBs, so no need to wake follower. */ + continue; /* No CBs, so no need to wake kthread. */ - /* Append callbacks to follower's "done" list. */ + /* Append callbacks to CB kthread's "done" list. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); tail = rdp->nocb_cb_tail; rdp->nocb_cb_tail = rdp->nocb_gp_tail; *tail = rdp->nocb_gp_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_cb_head) { - /* List was empty, so wake up the follower. */ + /* List was empty, so wake up the kthread. */ swake_up_one(&rdp->nocb_wq); } } - /* If we (the leader) don't have CBs, go wait some more. */ + /* If we (the GP kthreads) don't have CBs, go wait some more. */ if (!my_rdp->nocb_cb_head) goto wait_again; } /* - * Followers come here to wait for additional callbacks to show up. + * No-CBs CB kthreads come here to wait for additional callbacks to show up. * This function does not return until callbacks appear. */ static void nocb_follower_wait(struct rcu_data *rdp) @@ -1894,9 +1894,10 @@ static void nocb_follower_wait(struct rcu_data *rdp) /* * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU, however, there is - * an optional leader-follower relationship so that the grace-period - * kthreads don't have to do quite so many wakeups. + * callbacks queued by the corresponding no-CBs CPU, however, there is an + * optional GP-CB relationship so that the grace-period kthreads don't + * have to do quite so many wakeups (as in they only need to wake the + * no-CBs GP kthreads, not the CB kthreads). */ static int rcu_nocb_kthread(void *arg) { @@ -2056,7 +2057,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) /* * If the specified CPU is a no-CBs CPU that does not already have its * rcuo kthread, spawn it. 
If the CPUs are brought online out of order, - * this can require re-organizing the leader-follower relationships. + * this can require re-organizing the GP-CB relationships. */ static void rcu_spawn_one_nocb_kthread(int cpu) { @@ -2073,7 +2074,7 @@ static void rcu_spawn_one_nocb_kthread(int cpu) if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_cb_kthread) return; - /* If we didn't spawn the leader first, reorganize! */ + /* If we didn't spawn the GP kthread first, reorganize! */ rdp_old_leader = rdp_spawn->nocb_gp_rdp; if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_cb_kthread) { rdp_last = NULL; @@ -2125,18 +2126,18 @@ static void __init rcu_spawn_nocb_kthreads(void) rcu_spawn_cpu_nocb_kthread(cpu); } -/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ +/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ static int rcu_nocb_leader_stride = -1; module_param(rcu_nocb_leader_stride, int, 0444); /* - * Initialize leader-follower relationships for all no-CBs CPU. + * Initialize GP-CB relationships for all no-CBs CPU. */ static void __init rcu_organize_nocb_kthreads(void) { int cpu; int ls = rcu_nocb_leader_stride; - int nl = 0; /* Next leader. */ + int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ struct rcu_data *rdp_prev = NULL; @@ -2156,12 +2157,12 @@ static void __init rcu_organize_nocb_kthreads(void) for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { - /* New leader, set up for followers & next leader. */ + /* New GP kthread, set up for CBs & next GP. */ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; rdp->nocb_gp_rdp = rdp; rdp_leader = rdp; } else { - /* Another follower, link to previous leader. */ + /* Another CB kthread, link to previous GP kthread. */ rdp->nocb_gp_rdp = rdp_leader; rdp_prev->nocb_next_cb_rdp = rdp; } -- cgit v1.2.3 From 12f54c3a8410102afb96ed437aebe7f1d87f399f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 29 Mar 2019 16:43:51 -0700 Subject: rcu/nocb: Provide separate no-CBs grace-period kthreads Currently, there is one no-CBs rcuo kthread per CPU, and these kthreads are divided into groups. The first rcuo kthread to come online in a given group is that group's leader, and the leader both waits for grace periods and invokes its CPU's callbacks. The non-leader rcuo kthreads only invoke callbacks. This works well in the real-time/embedded environments for which it was intended because such environments tend not to generate all that many callbacks. However, given huge floods of callbacks, it is possible for the leader kthread to be stuck invoking callbacks while its followers wait helplessly while their callbacks pile up. This is a good recipe for an OOM, and rcutorture's new callback-flood capability does generate such OOMs. One strategy would be to wait until such OOMs start happening in production, but similar OOMs have in fact happened starting in 2018. It would therefore be wise to take a more proactive approach. This commit therefore features per-CPU rcuo kthreads that do nothing but invoke callbacks. Instead of having one of these kthreads act as leader, each group has a separate rcog kthread that handles grace periods for its group. Because these rcuog kthreads do not invoke callbacks, callback floods on one CPU no longer block callbacks from reaching the rcuc callback-invocation kthreads on other CPUs. This change does introduce additional kthreads, however: 1. 
The number of additional kthreads is about the square root of the number of CPUs, so that a 4096-CPU system would have only about 64 additional kthreads. Note that recent changes decreased the number of rcuo kthreads by a factor of two (CONFIG_PREEMPT=n) or even three (CONFIG_PREEMPT=y), so this still represents a significant improvement on most systems. 2. The leading "rcuo" of the rcuog kthreads should allow existing scripting to affinity these additional kthreads as needed, the same as for the rcuop and rcuos kthreads. (There are no longer any rcuob kthreads.) 3. A state-machine approach was considered and rejected. Although this would allow the rcuo kthreads to continue their dual leader/follower roles, it complicates callback invocation and makes it more difficult to consolidate rcuo callback invocation with existing softirq callback invocation. The introduction of rcuog kthreads should thus be acceptable. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 6 ++- kernel/rcu/tree_plugin.h | 115 +++++++++++++++++++++++------------------------ 2 files changed, 61 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 32b3348d3a4d..dc3c53cb9608 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -200,8 +200,8 @@ struct rcu_data { atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_cb_head; /* CBs ready to invoke. */ struct rcu_head **nocb_cb_tail; - struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ - struct task_struct *nocb_cb_kthread; + struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ + struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ @@ -211,6 +211,8 @@ struct rcu_data { /* CBs waiting for GP. */ struct rcu_head **nocb_gp_tail; bool nocb_gp_sleep; /* Is the nocb GP thread asleep? */ + struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ + struct task_struct *nocb_cb_kthread; struct rcu_data *nocb_next_cb_rdp; /* Next rcu_data in wakeup chain. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5a72700c3a32..c3b6493313ab 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1531,7 +1531,7 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, struct rcu_data *rdp_leader = rdp->nocb_gp_rdp; lockdep_assert_held(&rdp->nocb_lock); - if (!READ_ONCE(rdp_leader->nocb_cb_kthread)) { + if (!READ_ONCE(rdp_leader->nocb_gp_kthread)) { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; } @@ -1541,7 +1541,7 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, del_timer(&rdp->nocb_timer); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); smp_mb(); /* ->nocb_gp_sleep before swake_up_one(). */ - swake_up_one(&rdp_leader->nocb_wq); + swake_up_one(&rdp_leader->nocb_gp_wq); } else { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } @@ -1646,7 +1646,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ /* If we are not being polled and there is a kthread, awaken it ... 
*/ - t = READ_ONCE(rdp->nocb_cb_kthread); + t = READ_ONCE(rdp->nocb_gp_kthread); if (rcu_nocb_poll || !t) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); @@ -1786,7 +1786,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * No-CBs GP kthreads come here to wait for additional callbacks to show up. * This function does not return until callbacks appear. */ -static void nocb_leader_wait(struct rcu_data *my_rdp) +static void nocb_gp_wait(struct rcu_data *my_rdp) { bool firsttime = true; unsigned long flags; @@ -1794,12 +1794,10 @@ static void nocb_leader_wait(struct rcu_data *my_rdp) struct rcu_data *rdp; struct rcu_head **tail; -wait_again: - /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); - swait_event_interruptible_exclusive(my_rdp->nocb_wq, + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, !READ_ONCE(my_rdp->nocb_gp_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_gp_sleep = true; @@ -1838,7 +1836,7 @@ wait_again: trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("WokeEmpty")); } - goto wait_again; + return; } /* Wait for one grace period. */ @@ -1862,34 +1860,47 @@ wait_again: rdp->nocb_cb_tail = rdp->nocb_gp_tail; *tail = rdp->nocb_gp_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - if (rdp != my_rdp && tail == &rdp->nocb_cb_head) { + if (tail == &rdp->nocb_cb_head) { /* List was empty, so wake up the kthread. */ - swake_up_one(&rdp->nocb_wq); + swake_up_one(&rdp->nocb_cb_wq); } } +} - /* If we (the GP kthreads) don't have CBs, go wait some more. */ - if (!my_rdp->nocb_cb_head) - goto wait_again; +/* + * No-CBs grace-period-wait kthread. There is one of these per group + * of CPUs, but only once at least one CPU in that group has come online + * at least once since boot. This kthread checks for newly posted + * callbacks from any of the CPUs it is responsible for, waits for a + * grace period, then awakens all of the rcu_nocb_cb_kthread() instances + * that then have callback-invocation work to do. + */ +static int rcu_nocb_gp_kthread(void *arg) +{ + struct rcu_data *rdp = arg; + + for (;;) + nocb_gp_wait(rdp); + return 0; } /* * No-CBs CB kthreads come here to wait for additional callbacks to show up. - * This function does not return until callbacks appear. + * This function returns true ("keep waiting") until callbacks appear and + * then false ("stop waiting") when callbacks finally do appear. */ -static void nocb_follower_wait(struct rcu_data *rdp) +static bool nocb_follower_wait(struct rcu_data *rdp) { - for (;;) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); - swait_event_interruptible_exclusive(rdp->nocb_wq, - READ_ONCE(rdp->nocb_cb_head)); - if (smp_load_acquire(&rdp->nocb_cb_head)) { - /* ^^^ Ensure CB invocation follows _head test. */ - return; - } - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + READ_ONCE(rdp->nocb_cb_head)); + if (smp_load_acquire(&rdp->nocb_cb_head)) { /* VVV */ + /* ^^^ Ensure CB invocation follows _head test. 
*/ + return false; } + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + return true; } /* @@ -1899,7 +1910,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) * have to do quite so many wakeups (as in they only need to wake the * no-CBs GP kthreads, not the CB kthreads). */ -static int rcu_nocb_kthread(void *arg) +static int rcu_nocb_cb_kthread(void *arg) { int c, cl; unsigned long flags; @@ -1911,10 +1922,8 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through this loop invokes one batch of callbacks */ for (;;) { /* Wait for callbacks. */ - if (rdp->nocb_gp_rdp == rdp) - nocb_leader_wait(rdp); - else - nocb_follower_wait(rdp); + while (nocb_follower_wait(rdp)) + continue; /* Pull the ready-to-invoke callbacks onto local list. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); @@ -2048,7 +2057,8 @@ void __init rcu_init_nohz(void) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_swait_queue_head(&rdp->nocb_wq); + init_swait_queue_head(&rdp->nocb_cb_wq); + init_swait_queue_head(&rdp->nocb_gp_wq); rdp->nocb_cb_tail = &rdp->nocb_cb_head; raw_spin_lock_init(&rdp->nocb_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); @@ -2056,50 +2066,39 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) /* * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread, spawn it. If the CPUs are brought online out of order, - * this can require re-organizing the GP-CB relationships. + * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread + * for this CPU's group has not yet been created, spawn it as well. */ static void rcu_spawn_one_nocb_kthread(int cpu) { - struct rcu_data *rdp; - struct rcu_data *rdp_last; - struct rcu_data *rdp_old_leader; - struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp_gp; struct task_struct *t; /* * If this isn't a no-CBs CPU or if it already has an rcuo kthread, * then nothing to do. */ - if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_cb_kthread) + if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) return; /* If we didn't spawn the GP kthread first, reorganize! */ - rdp_old_leader = rdp_spawn->nocb_gp_rdp; - if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_cb_kthread) { - rdp_last = NULL; - rdp = rdp_old_leader; - do { - rdp->nocb_gp_rdp = rdp_spawn; - if (rdp_last && rdp != rdp_spawn) - rdp_last->nocb_next_cb_rdp = rdp; - if (rdp == rdp_spawn) { - rdp = rdp->nocb_next_cb_rdp; - } else { - rdp_last = rdp; - rdp = rdp->nocb_next_cb_rdp; - rdp_last->nocb_next_cb_rdp = NULL; - } - } while (rdp); - rdp_spawn->nocb_next_cb_rdp = rdp_old_leader; + rdp_gp = rdp->nocb_gp_rdp; + if (!rdp_gp->nocb_gp_kthread) { + t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, + "rcuog/%d", rdp_gp->cpu); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) + return; + WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); } /* Spawn the kthread for this CPU. 
*/ - t = kthread_run(rcu_nocb_kthread, rdp_spawn, + t = kthread_run(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) return; - WRITE_ONCE(rdp_spawn->nocb_cb_kthread, t); + WRITE_ONCE(rdp->nocb_cb_kthread, t); + WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); } /* -- cgit v1.2.3 From 9fa471a881df9ba84e0e0844d918ed1ec55fc567 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 31 Mar 2019 16:07:43 -0700 Subject: rcu/nocb: Rename nocb_follower_wait() to nocb_cb_wait() This commit adjusts naming to account for the new distinction between callback and grace-period no-CBs kthreads. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c3b6493313ab..9d5448217bbc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1889,7 +1889,7 @@ static int rcu_nocb_gp_kthread(void *arg) * This function returns true ("keep waiting") until callbacks appear and * then false ("stop waiting") when callbacks finally do appear. */ -static bool nocb_follower_wait(struct rcu_data *rdp) +static bool nocb_cb_wait(struct rcu_data *rdp) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); swait_event_interruptible_exclusive(rdp->nocb_cb_wq, @@ -1922,7 +1922,7 @@ static int rcu_nocb_cb_kthread(void *arg) /* Each pass through this loop invokes one batch of callbacks */ for (;;) { /* Wait for callbacks. */ - while (nocb_follower_wait(rdp)) + while (nocb_cb_wait(rdp)) continue; /* Pull the ready-to-invoke callbacks onto local list. */ -- cgit v1.2.3 From 5d62c08c5fe54492899b978d0ddc0bf7fd071317 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 31 Mar 2019 16:10:17 -0700 Subject: rcu/nocb: Rename wake_nocb_leader() to wake_nocb_gp() This commit adjusts naming to account for the new distinction between callback and grace-period no-CBs kthreads. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9d5448217bbc..632c2cfb9856 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1551,7 +1551,7 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, * Kick the GP kthread for this NOCB group, but caller has not * acquired locks. */ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) +static void wake_nocb_gp(struct rcu_data *rdp, bool force) { unsigned long flags; @@ -1656,7 +1656,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, if (old_rhpp == &rdp->nocb_head) { if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ - wake_nocb_leader(rdp, false); + wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { @@ -1667,7 +1667,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, } else if (len > rdp->qlen_last_fqs_check + qhimark) { /* ... or if many callbacks queued. 
*/ if (!irqs_disabled_flags(flags)) { - wake_nocb_leader(rdp, true); + wake_nocb_gp(rdp, true); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeOvf")); } else { -- cgit v1.2.3 From 5f675ba6eb5d1e4be56fc7c28881728373d880c4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 31 Mar 2019 16:11:57 -0700 Subject: rcu/nocb: Rename __wake_nocb_leader() to __wake_nocb_gp() This commit adjusts naming to account for the new distinction between callback and grace-period no-CBs kthreads. While in the area, it also updates local variables. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 632c2cfb9856..7c7870da234a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1524,24 +1524,24 @@ bool rcu_is_nocb_cpu(int cpu) * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock * and this function releases it. */ -static void __wake_nocb_leader(struct rcu_data *rdp, bool force, - unsigned long flags) +static void __wake_nocb_gp(struct rcu_data *rdp, bool force, + unsigned long flags) __releases(rdp->nocb_lock) { - struct rcu_data *rdp_leader = rdp->nocb_gp_rdp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; lockdep_assert_held(&rdp->nocb_lock); - if (!READ_ONCE(rdp_leader->nocb_gp_kthread)) { + if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; } - if (rdp_leader->nocb_gp_sleep || force) { + if (rdp_gp->nocb_gp_sleep || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ - WRITE_ONCE(rdp_leader->nocb_gp_sleep, false); + WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); del_timer(&rdp->nocb_timer); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); smp_mb(); /* ->nocb_gp_sleep before swake_up_one(). */ - swake_up_one(&rdp_leader->nocb_gp_wq); + swake_up_one(&rdp_gp->nocb_gp_wq); } else { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } @@ -1556,7 +1556,7 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force) unsigned long flags; raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - __wake_nocb_leader(rdp, force, flags); + __wake_nocb_gp(rdp, force, flags); } /* @@ -1988,7 +1988,7 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + __wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); } -- cgit v1.2.3 From 0d52a6652f15f6b1155297326de85c07ca421d64 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 31 Mar 2019 16:19:02 -0700 Subject: rcu/nocb: Rename wake_nocb_leader_defer() to wake_nocb_gp_defer() This commit adjusts naming to account for the new distinction between callback and grace-period no-CBs kthreads. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7c7870da234a..e6581a51ff9a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1563,8 +1563,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force) * Arrange to wake the GP kthread for this NOCB group at some future * time when it is safe to do so. 
*/ -static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, - const char *reason) +static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, + const char *reason) { unsigned long flags; @@ -1660,8 +1660,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { - wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, - TPS("WakeEmptyIsDeferred")); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { @@ -1671,8 +1671,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeOvf")); } else { - wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, - TPS("WakeOvfIsDeferred")); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, + TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { -- cgit v1.2.3 From 0bdc33daef96a54f9e5799d84f2fbc05d9e5cae3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 31 Mar 2019 16:20:52 -0700 Subject: rcu/nocb: Rename rcu_organize_nocb_kthreads() local variable This commit renames rdp_leader to rdp_gp in order to account for the new distinction between callback and grace-period no-CBs kthreads. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e6581a51ff9a..0af36e98e70f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2138,7 +2138,7 @@ static void __init rcu_organize_nocb_kthreads(void) int ls = rcu_nocb_leader_stride; int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; - struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ + struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ struct rcu_data *rdp_prev = NULL; if (!cpumask_available(rcu_nocb_mask)) @@ -2159,10 +2159,10 @@ static void __init rcu_organize_nocb_kthreads(void) /* New GP kthread, set up for CBs & next GP. */ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; rdp->nocb_gp_rdp = rdp; - rdp_leader = rdp; + rdp_gp = rdp; } else { /* Another CB kthread, link to previous GP kthread. */ - rdp->nocb_gp_rdp = rdp_leader; + rdp->nocb_gp_rdp = rdp_gp; rdp_prev->nocb_next_cb_rdp = rdp; } rdp_prev = rdp; -- cgit v1.2.3 From f7c9a9b664fb32a127e8e9a987b52023b92c3a0b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Apr 2019 09:57:01 -0700 Subject: rcu/nocb: Rename and document no-CB CB kthread sleep trace event The nocb_cb_wait() function traces a "FollowerSleep" trace_rcu_nocb_wake() event, which never was documented and is now misleading. This commit therefore changes "FollowerSleep" to "CBSleep", documents this, and updates the documentation for "Sleep" as well. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0af36e98e70f..be065aacd63b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1891,7 +1891,7 @@ static int rcu_nocb_gp_kthread(void *arg) */ static bool nocb_cb_wait(struct rcu_data *rdp) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); swait_event_interruptible_exclusive(rdp->nocb_cb_wq, READ_ONCE(rdp->nocb_cb_head)); if (smp_load_acquire(&rdp->nocb_cb_head)) { /* VVV */ -- cgit v1.2.3 From f7c612b000d7e974826089b5a6f6eecd6805862a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Apr 2019 08:05:55 -0700 Subject: rcu/nocb: Rename rcu_nocb_leader_stride kernel boot parameter This commit changes the name of the rcu_nocb_leader_stride kernel boot parameter to rcu_nocb_gp_stride in order to account for the new distinction between callback and grace-period no-CBs kthreads. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index be065aacd63b..80b27a9f306d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2126,8 +2126,8 @@ static void __init rcu_spawn_nocb_kthreads(void) } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ -static int rcu_nocb_leader_stride = -1; -module_param(rcu_nocb_leader_stride, int, 0444); +static int rcu_nocb_gp_stride = -1; +module_param(rcu_nocb_gp_stride, int, 0444); /* * Initialize GP-CB relationships for all no-CBs CPU. @@ -2135,7 +2135,7 @@ module_param(rcu_nocb_leader_stride, int, 0444); static void __init rcu_organize_nocb_kthreads(void) { int cpu; - int ls = rcu_nocb_leader_stride; + int ls = rcu_nocb_gp_stride; int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ @@ -2145,7 +2145,7 @@ static void __init rcu_organize_nocb_kthreads(void) return; if (ls == -1) { ls = int_sqrt(nr_cpu_ids); - rcu_nocb_leader_stride = ls; + rcu_nocb_gp_stride = ls; } /* -- cgit v1.2.3 From 18cd8c93e69e3853eb408980089fb3c58813f922 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 1 Jun 2019 05:12:36 -0700 Subject: rcu/nocb: Print gp/cb kthread hierarchy if dump_tree This commit causes the no-CBs grace-period/callback hierarchy to be printed to the console when the dump_tree kernel boot parameter is set. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 80b27a9f306d..0a3f8680b450 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2135,6 +2135,7 @@ module_param(rcu_nocb_gp_stride, int, 0444); static void __init rcu_organize_nocb_kthreads(void) { int cpu; + bool firsttime = true; int ls = rcu_nocb_gp_stride; int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; @@ -2160,10 +2161,15 @@ static void __init rcu_organize_nocb_kthreads(void) nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; rdp->nocb_gp_rdp = rdp; rdp_gp = rdp; + if (!firsttime && dump_tree) + pr_cont("\n"); + firsttime = false; + pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu); } else { /* Another CB kthread, link to previous GP kthread. 
*/ rdp->nocb_gp_rdp = rdp_gp; rdp_prev->nocb_next_cb_rdp = rdp; + pr_alert(" %d", cpu); } rdp_prev = rdp; } -- cgit v1.2.3 From 1bb5f9b95afe5d9d6b586389ce5e8f461a5b671c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Apr 2019 12:34:41 -0700 Subject: rcu/nocb: Use separate flag to indicate disabled ->cblist NULLing the RCU_NEXT_TAIL pointer was a clever way to save a byte, but forward-progress considerations would require that this pointer be both NULL and non-NULL, which, absent a quantum-computer port of the Linux kernel, simply won't happen. This commit therefore creates a separate ->enabled flag to replace the current NULL checks. [ paulmck: Add include files per 0day test robot and -next. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 3 ++- kernel/rcu/rcu_segcblist.h | 2 +- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9bd5f6023c21..b305dcac34c9 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -58,6 +58,7 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) rsclp->tails[i] = &rsclp->head; rsclp->len = 0; rsclp->len_lazy = 0; + rsclp->enabled = 1; } /* @@ -69,7 +70,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); - rsclp->tails[RCU_NEXT_TAIL] = NULL; + rsclp->enabled = 0; } /* diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 822a39da0533..b2de7b32da29 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -63,7 +63,7 @@ static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) */ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) { - return !!rsclp->tails[RCU_NEXT_TAIL]; + return rsclp->enabled; } /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a3f8680b450..b8a43cf9bb4e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2189,8 +2189,8 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) rcu_segcblist_n_cbs(&rdp->cblist)); atomic_long_set(&rdp->nocb_q_count_lazy, rcu_segcblist_n_lazy_cbs(&rdp->cblist)); - rcu_segcblist_init(&rdp->cblist); } + rcu_segcblist_init(&rdp->cblist); rcu_segcblist_disable(&rdp->cblist); return true; } -- cgit v1.2.3 From ce5215c1342c6c89b3c3c45fea82cddf0b013787 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Apr 2019 15:58:34 -0700 Subject: rcu/nocb: Use separate flag to indicate offloaded ->cblist RCU callback processing currently uses rcu_is_nocb_cpu() to determine whether or not the current CPU's callbacks are to be offloaded. This works, but it is not so good for cache locality. Plus use of ->cblist for offloaded callbacks will greatly increase the frequency of these checks. This commit therefore adds a ->offloaded flag to the rcu_segcblist structure to provide a more flexible and cache-friendly means of checking for callback offloading. Signed-off-by: Paul E.
McKenney --- kernel/rcu/rcu_segcblist.c | 12 ++++++++++++ kernel/rcu/rcu_segcblist.h | 7 +++++++ kernel/rcu/tree.c | 10 ++++++---- kernel/rcu/tree_plugin.h | 11 +++++++---- 4 files changed, 32 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index b305dcac34c9..700779f4c0cb 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -73,6 +73,18 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) rsclp->enabled = 0; } +/* + * Mark the specified rcu_segcblist structure as offloaded. This + * structure must be empty. + */ +void rcu_segcblist_offload(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->offloaded = 1; +} + /* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index b2de7b32da29..8f3783391075 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -66,6 +66,12 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) return rsclp->enabled; } +/* Is the specified rcu_segcblist offloaded? */ +static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) +{ + return rsclp->offloaded; +} + /* * Are all segments following the specified segment of the specified * rcu_segcblist structure empty of callbacks? (The specified @@ -78,6 +84,7 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); +void rcu_segcblist_offload(struct rcu_segcblist *rsclp); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a14e5fbbea46..6f5c96c4f9a3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2858,10 +2858,11 @@ void rcu_barrier(void) * corresponding CPU's preceding callbacks have been invoked. */ for_each_possible_cpu(cpu) { - if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) - continue; rdp = per_cpu_ptr(&rcu_data, cpu); - if (rcu_is_nocb_cpu(cpu)) { + if (!cpu_online(cpu) && + !rcu_segcblist_is_offloaded(&rdp->cblist)) + continue; + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { if (!rcu_nocb_cpu_needs_barrier(cpu)) { rcu_barrier_trace(TPS("OfflineNoCB"), cpu, rcu_state.barrier_sequence); @@ -3155,7 +3156,8 @@ void rcutree_migrate_callbacks(int cpu) struct rcu_node *rnp_root = rcu_get_root(); bool needwake; - if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) + if (rcu_segcblist_is_offloaded(&rdp->cblist) || + rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ local_irq_save(flags); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b8a43cf9bb4e..fc6133eed50a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1382,7 +1382,7 @@ static void rcu_prepare_for_idle(void) int tne; lockdep_assert_irqs_disabled(); - if (rcu_is_nocb_cpu(smp_processor_id())) + if (rcu_segcblist_is_offloaded(&rdp->cblist)) return; /* Handle nohz enablement switches conservatively. 
*/ @@ -1431,8 +1431,10 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + lockdep_assert_irqs_disabled(); - if (rcu_is_nocb_cpu(smp_processor_id())) + if (rcu_segcblist_is_offloaded(&rdp->cblist)) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1694,7 +1696,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags) { - if (!rcu_is_nocb_cpu(rdp->cpu)) + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) return false; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); if (__is_kfree_rcu_offset((unsigned long)rhp->func)) @@ -1729,7 +1731,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, unsigned long flags) { lockdep_assert_irqs_disabled(); - if (!rcu_is_nocb_cpu(smp_processor_id())) + if (!rcu_segcblist_is_offloaded(&my_rdp->cblist)) return false; /* Not NOCBs CPU, caller must migrate CBs. */ __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), rcu_segcblist_tail(&rdp->cblist), @@ -2192,6 +2194,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) } rcu_segcblist_init(&rdp->cblist); rcu_segcblist_disable(&rdp->cblist); + rcu_segcblist_offload(&rdp->cblist); return true; } -- cgit v1.2.3 From 750d7f6a434ff4640fa825dfb1eccb44e79fb6af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Apr 2019 08:19:43 -0700 Subject: rcu/nocb: Add checks for offloaded callback processing This commit is a preparatory patch for offloaded callbacks using the same ->cblist structure used by non-offloaded callbacks. It therefore adds rcu_segcblist_is_offloaded() calls where they will be needed when !rcu_segcblist_is_enabled() no longer flags the offloaded case. It also adds checks in rcu_do_batch() to ensure that there are no missed checks: Currently, it should not be possible for offloaded execution to reach rcu_do_batch(), though this will change later in this series. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6f5c96c4f9a3..969ba292a669 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -210,7 +210,8 @@ static long rcu_get_n_cbs_cpu(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */ + if (rcu_segcblist_is_enabled(&rdp->cblist) && + !rcu_segcblist_is_offloaded(&rdp->cblist)) /* Online normal CPU? */ return rcu_segcblist_n_cbs(&rdp->cblist); return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */ } @@ -2081,6 +2082,7 @@ static void rcu_do_batch(struct rcu_data *rdp) struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count; + WARN_ON_ONCE(rdp->cblist.offloaded); /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { trace_rcu_batch_start(rcu_state.name, @@ -2299,7 +2301,8 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? 
*/ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist)) { + rcu_segcblist_is_enabled(&rdp->cblist) && + !rcu_segcblist_is_offloaded(&rdp->cblist)) { local_irq_save(flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); @@ -2514,7 +2517,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ - if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { + if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || + rcu_segcblist_is_offloaded(&rdp->cblist) || cpu != -1) { int offline; if (cpu != -1) @@ -2750,6 +2754,7 @@ static int rcu_pending(void) /* Has RCU gone idle with this CPU needing another grace period? */ if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist) && + !rcu_segcblist_is_offloaded(&rdp->cblist) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; -- cgit v1.2.3 From c00045be32fe13333ba8c62748ba04747c182838 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Apr 2019 14:09:15 -0700 Subject: rcu/nocb: Make rcutree_migrate_callbacks() start at leaf rcu_node structure Because rcutree_migrate_callbacks() is invoked infrequently and because an exact snapshot of the grace-period state might save some callbacks a second trip through a grace period, this function has used the root rcu_node structure. However, this safe-second-trip optimization happens only if rcutree_migrate_callbacks() races with grace-period initialization, so it is not worth the added mental load. This commit therefore makes rcutree_migrate_callbacks() start with the leaf rcu_node structures, as is done elsewhere. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 969ba292a669..ea479d81da7f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3157,8 +3157,8 @@ void rcutree_migrate_callbacks(int cpu) { unsigned long flags; struct rcu_data *my_rdp; + struct rcu_node *my_rnp; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp_root = rcu_get_root(); bool needwake; if (rcu_segcblist_is_offloaded(&rdp->cblist) || @@ -3167,18 +3167,19 @@ void rcutree_migrate_callbacks(int cpu) local_irq_save(flags); my_rdp = this_cpu_ptr(&rcu_data); + my_rnp = my_rdp->mynode; if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { local_irq_restore(flags); return; } - raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */ /* Leverage recent GPs and set GP for new callbacks. */ - needwake = rcu_advance_cbs(rnp_root, rdp) || - rcu_advance_cbs(rnp_root, my_rdp); + needwake = rcu_advance_cbs(my_rnp, rdp) || + rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); - raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); if (needwake) rcu_gp_kthread_wake(); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || -- cgit v1.2.3 From 85f69b32126dcf798f2c8d69a7957ba990bc2e02 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 16 Apr 2019 14:48:28 -0700 Subject: rcu/nocb: Check for deferred nocb wakeups before nohz_full early exit In theory, a timer is used to defer wakeups of no-CBs grace-period kthreads when the wakeup cannot be done safely directly from the call_rcu(). In practice, the one-jiffy delay is not always consistent with timely callback invocation under heavy call_rcu() loads. Therefore, there are a number of checks for a pending deferred wakeup, including from the scheduling-clock interrupt. Unfortunately, this check follows the rcu_nohz_full_cpu() early exit, which renders it useless on such CPUs. This commit therefore moves the check for the pending deferred no-CB wakeup to precede the rcu_nohz_full_cpu() early exit. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ea479d81da7f..f1a25d17e3a0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2739,6 +2739,10 @@ static int rcu_pending(void) /* Check for CPU stalls, if enabled. */ check_cpu_stall(rdp); + /* Does this CPU need a deferred NOCB wakeup? */ + if (rcu_nocb_need_deferred_wakeup(rdp)) + return 1; + /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ if (rcu_nohz_full_cpu()) return 0; @@ -2763,10 +2767,6 @@ static int rcu_pending(void) unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; - /* Does this CPU need a deferred NOCB wakeup? */ - if (rcu_nocb_need_deferred_wakeup(rdp)) - return 1; - /* nothing to do */ return 0; } -- cgit v1.2.3 From ca5c8258081178c60b27e3532d9ea95b6eaa7040 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Apr 2019 15:15:24 -0700 Subject: rcu/nocb: Remove deferred wakeup checks for extended quiescent states The idea behind the checks for extended quiescent states at the end of __call_rcu_nocb() is to handle cases where call_rcu() is invoked directly from within an extended quiescent state, for example, from the idle loop. However, this will result in a timer-mediated deferred wakeup, which will cause the needed wakeup to happen within a jiffy or thereabouts. There should be no forward-progress concerns, and if there are, the proper response is to exit the extended quiescent state while executing the endless blast of call_rcu() invocations, for example, using RCU_NONIDLE(). Given the more realistic case of an isolated call_rcu() invocation, there should be no problem. This commit therefore removes the checks for invoking call_rcu() within an extended quiescent state for on no-CBs CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fc6133eed50a..9936a66b80bb 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1709,16 +1709,6 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, -atomic_long_read(&rdp->nocb_q_count_lazy), -rcu_get_n_cbs_nocb_cpu(rdp)); - /* - * If called from an extended quiescent state with interrupts - * disabled, invoke the RCU core in order to allow the idle-entry - * deferred-wakeup check to function. - */ - if (irqs_disabled_flags(flags) && - !rcu_is_watching() && - cpu_online(smp_processor_id())) - invoke_rcu_core(); - return true; } -- cgit v1.2.3 From 76c6927c3ee443e756f2c0c9f992cb04b26c65f2 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 13 May 2019 14:36:11 -0700 Subject: rcu/nocb: Allow lockless use of rcu_segcblist_restempty() Currently, rcu_segcblist_restempty() assumes that the callback list is not being changed by other CPUs, but upcoming changes will require it to operate locklessly. This commit therefore adds the needed READ_ONCE() calls, along with the WRITE_ONCE() calls when updating the callback list. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 30 +++++++++++++++--------------- kernel/rcu/rcu_segcblist.h | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 700779f4c0cb..0e7fe678b6ac 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -147,8 +147,8 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is enqueued. */ rhp->next = NULL; - *rsclp->tails[RCU_NEXT_TAIL] = rhp; - rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; + WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); + WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next); } /* @@ -176,9 +176,9 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) if (rsclp->tails[i] != rsclp->tails[i - 1]) break; - *rsclp->tails[i] = rhp; + WRITE_ONCE(*rsclp->tails[i], rhp); for (; i <= RCU_NEXT_TAIL; i++) - rsclp->tails[i] = &rhp->next; + WRITE_ONCE(rsclp->tails[i], &rhp->next); return true; } @@ -214,11 +214,11 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, return; /* Nothing to do. */ *rclp->tail = rsclp->head; rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; + WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); rclp->tail = rsclp->tails[RCU_DONE_TAIL]; for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) - rsclp->tails[i] = &rsclp->head; + WRITE_ONCE(rsclp->tails[i], &rsclp->head); } /* @@ -237,9 +237,9 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, return; /* Nothing to do. */ *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; + WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; + WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); } /* @@ -271,7 +271,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, rsclp->head = rclp->head; for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) if (&rsclp->head == rsclp->tails[i]) - rsclp->tails[i] = rclp->tail; + WRITE_ONCE(rsclp->tails[i], rclp->tail); else break; rclp->head = NULL; @@ -287,8 +287,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, { if (!rclp->head) return; /* Nothing to do. */ - *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; - rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); + WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); rclp->head = NULL; rclp->tail = &rclp->head; } @@ -312,7 +312,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) break; - rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]); } /* If no callbacks moved, nothing more need be done. 
*/ @@ -321,7 +321,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) /* Clean up tail pointers that might have been misordered above. */ for (j = RCU_WAIT_TAIL; j < i; j++) - rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]); /* * Callbacks moved, so clean up the misordered ->tails[] pointers @@ -332,7 +332,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) break; /* No more callbacks. */ - rsclp->tails[j] = rsclp->tails[i]; + WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]); rsclp->gp_seq[j] = rsclp->gp_seq[i]; } } @@ -397,7 +397,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * structure other than in the RCU_NEXT_TAIL segment. */ for (; i < RCU_NEXT_TAIL; i++) { - rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]); rsclp->gp_seq[i] = seq; } return true; diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 8f3783391075..f74960f0305c 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -79,7 +79,7 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) */ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) { - return !*rsclp->tails[seg]; + return !READ_ONCE(*READ_ONCE(rsclp->tails[seg])); } void rcu_segcblist_init(struct rcu_segcblist *rsclp); -- cgit v1.2.3 From e6060b41c9955374079926a7612b857a8458ed1f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 May 2019 15:57:50 -0700 Subject: rcu/nocb: Allow lockless use of rcu_segcblist_empty() Currently, rcu_segcblist_empty() assumes that the callback list is not being changed by other CPUs, but upcoming changes will require it to operate locklessly. This commit therefore adds the needed READ_ONCE() call, along with the WRITE_ONCE() calls when updating the callback list's ->head field. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 4 ++-- kernel/rcu/rcu_segcblist.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 0e7fe678b6ac..06435a368be5 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -213,7 +213,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_ready_cbs(rsclp)) return; /* Nothing to do. */ *rclp->tail = rsclp->head; - rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]); WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); rclp->tail = rsclp->tails[RCU_DONE_TAIL]; for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) @@ -268,7 +268,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, if (!rclp->head) return; /* No callbacks to move. 
*/ *rclp->tail = rsclp->head; - rsclp->head = rclp->head; + WRITE_ONCE(rsclp->head, rclp->head); for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) if (&rsclp->head == rsclp->tails[i]) WRITE_ONCE(rsclp->tails[i], rclp->tail); diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index f74960f0305c..d9142b3590a8 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -36,7 +36,7 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); */ static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) { - return !rsclp->head; + return !READ_ONCE(rsclp->head); } /* Return number of callbacks in segmented callback list. */ -- cgit v1.2.3 From e83e73f5b0f8de6a8978ba64185e80fdf48a2a63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 May 2019 09:50:49 -0700 Subject: rcu/nocb: Leave ->cblist enabled for no-CBs CPUs As a first step towards making no-CBs CPUs use the ->cblist, this commit leaves the ->cblist enabled for these CPUs. The main reason to make no-CBs CPUs use ->cblist is to take advantage of callback numbering, which will reduce the effects of missed grace periods which in turn will reduce forward-progress problems for no-CBs CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 3 --- kernel/rcu/rcu_segcblist.h | 2 +- kernel/rcu/tree.c | 5 +++-- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 35 +++++++---------------------------- 5 files changed, 11 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 06435a368be5..9ac28f175627 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -79,9 +79,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) { - WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); rsclp->offloaded = 1; } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index d9142b3590a8..ed3fcece39a9 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -59,7 +59,7 @@ static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) /* * Is the specified rcu_segcblist enabled, for example, not corresponding - * to an offline or callback-offloaded CPU? + * to an offline CPU? */ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f1a25d17e3a0..2917ce379b23 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2964,7 +2964,8 @@ rcu_boot_init_percpu_data(int cpu) * Initializes a CPU's per-CPU RCU data. Note that only one online or * offline event can be happening at a given time. Note also that we can * accept some slop in the rsp->gp_seq access due to the fact that this - * CPU cannot possibly have any RCU callbacks in flight yet. + * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet. + * And any offloaded callbacks are being numbered elsewhere. */ int rcutree_prepare_cpu(unsigned int cpu) { @@ -2978,7 +2979,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->blimit = blimit; if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ - !init_nocb_callback_list(rdp)) + !rcu_segcblist_is_offloaded(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks_nesting = 1; /* CPU not up, no tearing. 
*/ rcu_dynticks_eqs_online(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index dc3c53cb9608..8d9cfcac6757 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -438,7 +438,6 @@ static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -static bool init_nocb_callback_list(struct rcu_data *rdp); static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9936a66b80bb..2d37fd3fa0d4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2007,6 +2007,7 @@ void __init rcu_init_nohz(void) { int cpu; bool need_rcu_nocb_mask = false; + struct rcu_data *rdp; #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) @@ -2040,8 +2041,12 @@ void __init rcu_init_nohz(void) if (rcu_nocb_poll) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - for_each_cpu(cpu, rcu_nocb_mask) - init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu)); + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rcu_segcblist_empty(&rdp->cblist)) + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_offload(&rdp->cblist); + } rcu_organize_nocb_kthreads(); } @@ -2167,27 +2172,6 @@ static void __init rcu_organize_nocb_kthreads(void) } } -/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ -static bool init_nocb_callback_list(struct rcu_data *rdp) -{ - if (!rcu_is_nocb_cpu(rdp->cpu)) - return false; - - /* If there are early-boot callbacks, move them to nocb lists. */ - if (!rcu_segcblist_empty(&rdp->cblist)) { - rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); - rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); - atomic_long_set(&rdp->nocb_q_count, - rcu_segcblist_n_cbs(&rdp->cblist)); - atomic_long_set(&rdp->nocb_q_count_lazy, - rcu_segcblist_n_lazy_cbs(&rdp->cblist)); - } - rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_disable(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist); - return true; -} - /* * Bind the current task to the offloaded CPUs. If there are no offloaded * CPUs, leave the task unbound. Splat if the bind attempt fails. @@ -2263,11 +2247,6 @@ static void __init rcu_spawn_nocb_kthreads(void) { } -static bool init_nocb_callback_list(struct rcu_data *rdp) -{ - return false; -} - static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) { return 0; -- cgit v1.2.3 From 5d6742b37727e111f4755155e59c5319cf5caa7b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 15 May 2019 09:56:40 -0700 Subject: rcu/nocb: Use rcu_segcblist for no-CBs CPUs Currently the RCU callbacks for no-CBs CPUs are queued on a series of ad-hoc linked lists, which means that these callbacks cannot benefit from "drive-by" grace periods, thus suffering needless delays prior to invocation. In addition, the no-CBs grace-period kthreads first wait for callbacks to appear and later wait for a new grace period, which means that callbacks appearing during a grace-period wait can be delayed. These delays increase memory footprint, and could even result in an out-of-memory condition. This commit therefore enqueues RCU callbacks from no-CBs CPUs on the rcu_segcblist structure that is already used by non-no-CBs CPUs. It also restructures the no-CBs grace-period kthread to be checking for incoming callbacks while waiting for grace periods. 
Also, instead of waiting for a new grace period, it waits for the closest grace period that will cause some of the callbacks to be safe to invoke. All of these changes reduce callback latency and thus the number of outstanding callbacks, in turn reducing the probability of an out-of-memory condition. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 12 ++ kernel/rcu/rcu_segcblist.h | 1 + kernel/rcu/tree.c | 116 ++++++----- kernel/rcu/tree.h | 14 +- kernel/rcu/tree_plugin.h | 510 +++++++++++++++++---------------------------- 5 files changed, 270 insertions(+), 383 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9ac28f175627..92968b856593 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -127,6 +127,18 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) return NULL; } +/* + * Return false if there are no CBs awaiting grace periods, otherwise, + * return true and store the nearest waited-upon grace period into *lp. + */ +bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) +{ + if (!rcu_segcblist_pend_cbs(rsclp)) + return false; + *lp = rsclp->gp_seq[RCU_WAIT_TAIL]; + return true; +} + /* * Enqueue the specified callback onto the specified rcu_segcblist * structure, updating accounting as needed. Note that the ->len diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index ed3fcece39a9..db38f0a512c4 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -89,6 +89,7 @@ bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); +bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp, bool lazy); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2917ce379b23..054418d2d960 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1343,8 +1343,10 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) */ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { - bool ret; + bool ret = false; bool need_gp; + const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_is_offloaded(&rdp->cblist); raw_lockdep_assert_held_rcu_node(rnp); @@ -1354,10 +1356,12 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || unlikely(READ_ONCE(rdp->gpwrap))) { - ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */ + if (!offloaded) + ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); } else { - ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */ + if (!offloaded) + ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */ } /* Now handle the beginnings of any new-to-this-CPU grace periods. 
*/ @@ -1658,6 +1662,7 @@ static void rcu_gp_cleanup(void) unsigned long gp_duration; bool needgp = false; unsigned long new_gp_seq; + bool offloaded; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(); struct swait_queue_head *sq; @@ -1723,7 +1728,9 @@ static void rcu_gp_cleanup(void) needgp = true; } /* Advance CBs to reduce false positives below. */ - if (!rcu_accelerate_cbs(rnp, rdp) && needgp) { + offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_is_offloaded(&rdp->cblist); + if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); rcu_state.gp_req_activity = jiffies; trace_rcu_grace_period(rcu_state.name, @@ -1917,7 +1924,9 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needwake; + bool needwake = false; + const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; rnp = rdp->mynode; @@ -1944,7 +1953,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - needwake = rcu_accelerate_cbs(rnp, rdp); + if (!offloaded) + needwake = rcu_accelerate_cbs(rnp, rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ @@ -2082,7 +2092,6 @@ static void rcu_do_batch(struct rcu_data *rdp) struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count; - WARN_ON_ONCE(rdp->cblist.offloaded); /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { trace_rcu_batch_start(rcu_state.name, @@ -2101,13 +2110,14 @@ static void rcu_do_batch(struct rcu_data *rdp) * callback counts, as rcu_barrier() needs to be conservative. */ local_irq_save(flags); + rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); bl = rdp->blimit; trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); - local_irq_restore(flags); + rcu_nocb_unlock_irqrestore(rdp, flags); /* Invoke callbacks. */ rhp = rcu_cblist_dequeue(&rcl); @@ -2120,12 +2130,22 @@ static void rcu_do_batch(struct rcu_data *rdp) * Note: The rcl structure counts down from zero. */ if (-rcl.len >= bl && + !rcu_segcblist_is_offloaded(&rdp->cblist) && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + WARN_ON_ONCE(in_serving_softirq()); + local_bh_enable(); + lockdep_assert_irqs_enabled(); + cond_resched_tasks_rcu_qs(); + lockdep_assert_irqs_enabled(); + local_bh_disable(); + } } local_irq_save(flags); + rcu_nocb_lock(rdp); count = -rcl.len; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2153,10 +2173,11 @@ static void rcu_do_batch(struct rcu_data *rdp) */ WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); - local_irq_restore(flags); + rcu_nocb_unlock_irqrestore(rdp, flags); /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) + if (!rcu_segcblist_is_offloaded(&rdp->cblist) && + rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_core(); } @@ -2312,7 +2333,8 @@ static __latent_entropy void rcu_core(void) rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. 
*/ - if (rcu_segcblist_ready_cbs(&rdp->cblist) && + if (!rcu_segcblist_is_offloaded(&rdp->cblist) && + rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) rcu_do_batch(rdp); @@ -2492,10 +2514,11 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) { unsigned long flags; struct rcu_data *rdp; + bool was_alldone; /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); @@ -2517,29 +2540,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ - if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || - rcu_segcblist_is_offloaded(&rdp->cblist) || cpu != -1) { - int offline; - - if (cpu != -1) - rdp = per_cpu_ptr(&rcu_data, cpu); - if (likely(rdp->mynode)) { - /* Post-boot, so this should be for a no-CBs CPU. */ - offline = !__call_rcu_nocb(rdp, head, lazy, flags); - WARN_ON_ONCE(offline); - /* Offline CPU, _call_rcu() illegal, leak callback. */ - local_irq_restore(flags); - return; - } - /* - * Very early boot, before rcu_init(). Initialize if needed - * and then drop through to queue the callback. - */ - WARN_ON_ONCE(cpu != -1); + if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) { + // This can trigger due to call_rcu() from offline CPU: + WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE); WARN_ON_ONCE(!rcu_is_watching()); + // Very early boot, before rcu_init(). Initialize if needed + // and then drop through to queue the callback. if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); } + rcu_nocb_lock(rdp); + was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); rcu_segcblist_enqueue(&rdp->cblist, head, lazy); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, @@ -2552,8 +2563,13 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. 
*/ - __call_rcu_core(rdp, head, flags); - local_irq_restore(flags); + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { + __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ + } else { + __call_rcu_core(rdp, head, flags); + local_irq_restore(flags); + } } /** @@ -2593,7 +2609,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, -1, 0); + __call_rcu(head, func, 0); } EXPORT_SYMBOL_GPL(call_rcu); @@ -2606,7 +2622,7 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, -1, 1); + __call_rcu(head, func, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -2806,6 +2822,7 @@ static void rcu_barrier_func(void *unused) rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); + rcu_nocb_lock(rdp); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { @@ -2813,6 +2830,7 @@ static void rcu_barrier_func(void *unused) rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence); } + rcu_nocb_unlock(rdp); } /** @@ -2867,19 +2885,7 @@ void rcu_barrier(void) if (!cpu_online(cpu) && !rcu_segcblist_is_offloaded(&rdp->cblist)) continue; - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { - if (!rcu_nocb_cpu_needs_barrier(cpu)) { - rcu_barrier_trace(TPS("OfflineNoCB"), cpu, - rcu_state.barrier_sequence); - } else { - rcu_barrier_trace(TPS("OnlineNoCB"), cpu, - rcu_state.barrier_sequence); - smp_mb__before_atomic(); - atomic_inc(&rcu_state.barrier_cpu_count); - __call_rcu(&rdp->barrier_head, - rcu_barrier_callback, cpu, 0); - } - } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { + if (rcu_segcblist_n_cbs(&rdp->cblist)) { rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); @@ -3169,10 +3175,7 @@ void rcutree_migrate_callbacks(int cpu) local_irq_save(flags); my_rdp = this_cpu_ptr(&rcu_data); my_rnp = my_rdp->mynode; - if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { - local_irq_restore(flags); - return; - } + rcu_nocb_lock(my_rdp); /* irqs already disabled. */ raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */ /* Leverage recent GPs and set GP for new callbacks. */ needwake = rcu_advance_cbs(my_rnp, rdp) || @@ -3180,9 +3183,16 @@ void rcutree_migrate_callbacks(int cpu) rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); - raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); + if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) { + raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ + __call_rcu_nocb_wake(my_rdp, true, flags); + } else { + rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); + } if (needwake) rcu_gp_kthread_wake(); + lockdep_assert_irqs_enabled(); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8d9cfcac6757..529eec2aa74d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -211,7 +211,9 @@ struct rcu_data { /* CBs waiting for GP. 
*/ struct rcu_head **nocb_gp_tail; bool nocb_gp_sleep; /* Is the nocb GP thread asleep? */ + bool nocb_gp_forced; /* Forced nocb GP thread wakeup? */ struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ + bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ struct task_struct *nocb_cb_kthread; struct rcu_data *nocb_next_cb_rdp; /* Next rcu_data in wakeup chain. */ @@ -421,20 +423,20 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); static void zero_cpu_stall_ticks(struct rcu_data *rdp); -static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, - struct rcu_data *rdp, - unsigned long flags); +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); +static void rcu_nocb_lock(struct rcu_data *rdp); +static void rcu_nocb_unlock(struct rcu_data *rdp); +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2d37fd3fa0d4..feffc46cccb0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1494,6 +1494,45 @@ static int __init parse_rcu_nocb_poll(char *arg) } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +/* + * Acquire the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + lockdep_assert_irqs_disabled(); + raw_spin_lock(&rdp->nocb_lock); + } +} + +/* + * Release the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_lock); + } +} + +/* + * Release the specified rcu_data structure's ->nocb_lock and restore + * interrupts, but only if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + } else { + local_irq_restore(flags); + } +} + /* * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. @@ -1526,7 +1565,7 @@ bool rcu_is_nocb_cpu(int cpu) * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock * and this function releases it. 
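 * [Editor's note: the hunk below renames __wake_nocb_gp() to
 * wake_nocb_gp() and moves the clearing of ->nocb_gp_sleep under the
 * GP leader's ->nocb_lock, immediately before wake_up_process().]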
*/ -static void __wake_nocb_gp(struct rcu_data *rdp, bool force, +static void wake_nocb_gp(struct rcu_data *rdp, bool force, unsigned long flags) __releases(rdp->nocb_lock) { @@ -1537,30 +1576,19 @@ static void __wake_nocb_gp(struct rcu_data *rdp, bool force, raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; } - if (rdp_gp->nocb_gp_sleep || force) { - /* Prior smp_mb__after_atomic() orders against prior enqueue. */ - WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); + if (READ_ONCE(rdp_gp->nocb_gp_sleep) || force) { del_timer(&rdp->nocb_timer); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - smp_mb(); /* ->nocb_gp_sleep before swake_up_one(). */ - swake_up_one(&rdp_gp->nocb_gp_wq); + smp_mb(); /* enqueue before ->nocb_gp_sleep. */ + raw_spin_lock_irqsave(&rdp_gp->nocb_lock, flags); + WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); + raw_spin_unlock_irqrestore(&rdp_gp->nocb_lock, flags); + wake_up_process(rdp_gp->nocb_gp_kthread); } else { raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } } -/* - * Kick the GP kthread for this NOCB group, but caller has not - * acquired locks. - */ -static void wake_nocb_gp(struct rcu_data *rdp, bool force) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - __wake_nocb_gp(rdp, force, flags); -} - /* * Arrange to wake the GP kthread for this NOCB group at some future * time when it is safe to do so. @@ -1568,295 +1596,148 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force) static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, const char *reason) { - unsigned long flags; - - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) mod_timer(&rdp->nocb_timer, jiffies + 1); WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); -} - -/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */ -static bool rcu_nocb_cpu_needs_barrier(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - unsigned long ret; -#ifdef CONFIG_PROVE_RCU - struct rcu_head *rhp; -#endif /* #ifdef CONFIG_PROVE_RCU */ - - /* - * Check count of all no-CBs callbacks awaiting invocation. - * There needs to be a barrier before this function is called, - * but associated with a prior determination that no more - * callbacks would be posted. In the worst case, the first - * barrier in rcu_barrier() suffices (but the caller cannot - * necessarily rely on this, not a substitute for the caller - * getting the concurrency design right!). There must also be a - * barrier between the following load and posting of a callback - * (if a callback is in fact needed). This is associated with an - * atomic_inc() in the caller. - */ - ret = rcu_get_n_cbs_nocb_cpu(rdp); - -#ifdef CONFIG_PROVE_RCU - rhp = READ_ONCE(rdp->nocb_head); - if (!rhp) - rhp = READ_ONCE(rdp->nocb_gp_head); - if (!rhp) - rhp = READ_ONCE(rdp->nocb_cb_head); - - /* Having no rcuo kthread but CBs after scheduler starts is bad! */ - if (!READ_ONCE(rdp->nocb_cb_kthread) && rhp && - rcu_scheduler_fully_active) { - /* RCU callback enqueued before CPU first came online??? */ - pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", - cpu, rhp->func); - WARN_ON_ONCE(1); - } -#endif /* #ifdef CONFIG_PROVE_RCU */ - - return !!ret; } /* - * Enqueue the specified string of rcu_head structures onto the specified - * CPU's no-CBs lists. 
The CPU is specified by rdp, the head of the - * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy - * counts are supplied by rhcount and rhcount_lazy. + * Awaken the no-CBs grace-period kthead if needed, either due to it + * legitimately being asleep or due to overload conditions. * * If warranted, also wake up the kthread servicing this CPUs queues. */ -static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, - struct rcu_head *rhp, - struct rcu_head **rhtp, - int rhcount, int rhcount_lazy, - unsigned long flags) +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, + unsigned long flags) + __releases(rdp->nocb_lock) { int len; - struct rcu_head **old_rhpp; struct task_struct *t; - /* Enqueue the callback on the nocb list and update counts. */ - atomic_long_add(rhcount, &rdp->nocb_q_count); - /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ - old_rhpp = xchg(&rdp->nocb_tail, rhtp); - WRITE_ONCE(*old_rhpp, rhp); - atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); - smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ - - /* If we are not being polled and there is a kthread, awaken it ... */ + // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); if (rcu_nocb_poll || !t) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); + rcu_nocb_unlock_irqrestore(rdp, flags); return; } - len = rcu_get_n_cbs_nocb_cpu(rdp); - if (old_rhpp == &rdp->nocb_head) { + // Need to actually to a wakeup. + len = rcu_segcblist_n_cbs(&rdp->cblist); + if (was_alldone) { if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ - wake_nocb_gp(rdp, false); + wake_nocb_gp(rdp, false, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); + rcu_nocb_unlock_irqrestore(rdp, flags); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { /* ... or if many callbacks queued. */ if (!irqs_disabled_flags(flags)) { - wake_nocb_gp(rdp, true); + wake_nocb_gp(rdp, true, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeOvf")); } else { wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); + rcu_nocb_unlock_irqrestore(rdp, flags); } rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + rcu_nocb_unlock_irqrestore(rdp, flags); } + if (!irqs_disabled_flags(flags)) + lockdep_assert_irqs_enabled(); return; } /* - * This is a helper for __call_rcu(), which invokes this when the normal - * callback queue is inoperable. If this is not a no-CBs CPU, this - * function returns failure back to __call_rcu(), which can complain - * appropriately. - * - * Otherwise, this function queues the callback where the corresponding - * "rcuo" kthread can find it. 
- */ -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy, unsigned long flags) -{ - - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) - return false; - __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); - if (__is_kfree_rcu_offset((unsigned long)rhp->func)) - trace_rcu_kfree_callback(rcu_state.name, rhp, - (unsigned long)rhp->func, - -atomic_long_read(&rdp->nocb_q_count_lazy), - -rcu_get_n_cbs_nocb_cpu(rdp)); - else - trace_rcu_callback(rcu_state.name, rhp, - -atomic_long_read(&rdp->nocb_q_count_lazy), - -rcu_get_n_cbs_nocb_cpu(rdp)); - - return true; -} - -/* - * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is - * not a no-CBs CPU. - */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, - struct rcu_data *rdp, - unsigned long flags) -{ - lockdep_assert_irqs_disabled(); - if (!rcu_segcblist_is_offloaded(&my_rdp->cblist)) - return false; /* Not NOCBs CPU, caller must migrate CBs. */ - __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), - rcu_segcblist_tail(&rdp->cblist), - rcu_segcblist_n_cbs(&rdp->cblist), - rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); - rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_disable(&rdp->cblist); - return true; -} - -/* - * If necessary, kick off a new grace period, and either way wait - * for a subsequent grace period to complete. - */ -static void rcu_nocb_wait_gp(struct rcu_data *rdp) -{ - unsigned long c; - bool d; - unsigned long flags; - bool needwake; - struct rcu_node *rnp = rdp->mynode; - - local_irq_save(flags); - c = rcu_seq_snap(&rcu_state.gp_seq); - if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { - local_irq_restore(flags); - } else { - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_start_this_gp(rnp, rdp, c); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) - rcu_gp_kthread_wake(); - } - - /* - * Wait for the grace period. Do so interruptibly to avoid messing - * up the load average. - */ - trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); - for (;;) { - swait_event_interruptible_exclusive( - rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], - (d = rcu_seq_done(&rnp->gp_seq, c))); - if (likely(d)) - break; - WARN_ON(signal_pending(current)); - trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); - } - trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); - smp_mb(); /* Ensure that CB invocation happens after GP end. */ -} - -/* - * No-CBs GP kthreads come here to wait for additional callbacks to show up. - * This function does not return until callbacks appear. + * No-CBs GP kthreads come here to wait for additional callbacks to show up + * or for grace periods to end. */ static void nocb_gp_wait(struct rcu_data *my_rdp) { - bool firsttime = true; + int __maybe_unused cpu = my_rdp->cpu; + unsigned long cur_gp_seq; unsigned long flags; bool gotcbs; + bool needwait_gp = false; + bool needwake; + bool needwake_gp; struct rcu_data *rdp; - struct rcu_head **tail; - - /* Wait for callbacks to appear. */ - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); - swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, - !READ_ONCE(my_rdp->nocb_gp_sleep)); - raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); - my_rdp->nocb_gp_sleep = true; - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); - raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); - } else if (firsttime) { - firsttime = false; /* Don't drown trace log with "Poll"! 
*/ - trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll")); - } + struct rcu_node *rnp; + unsigned long wait_gp_seq; /* - * Each pass through the following loop checks for CBs. - * We are our own first CB kthread. Any CBs found are moved to - * nocb_gp_head, where they await a grace period. + * Each pass through the following loop checks for CBs and for the + * nearest grace period (if any) to wait for next. The CB kthreads + * and the global grace-period kthread are awakened if needed. */ - gotcbs = false; - smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { - rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); - if (!rdp->nocb_gp_head) - continue; /* No CBs here, try next. */ - - /* Move callbacks to wait-for-GP list, which is empty. */ - WRITE_ONCE(rdp->nocb_head, NULL); - rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - gotcbs = true; - } - - /* No callbacks? Sleep a bit if polling, and go retry. */ - if (unlikely(!gotcbs)) { - WARN_ON(signal_pending(current)); - if (rcu_nocb_poll) { - schedule_timeout_interruptible(1); - } else { - trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, - TPS("WokeEmpty")); + if (rcu_segcblist_empty(&rdp->cblist)) + continue; /* No callbacks here, try next. */ + rnp = rdp->mynode; + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + needwake_gp = rcu_advance_cbs(rnp, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + // Need to wait on some grace period? + if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { + if (!needwait_gp || + ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) + wait_gp_seq = cur_gp_seq; + needwait_gp = true; } - return; - } - - /* Wait for one grace period. */ - rcu_nocb_wait_gp(my_rdp); - - /* Each pass through this loop wakes a CB kthread, if needed. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { - if (!rcu_nocb_poll && - READ_ONCE(rdp->nocb_head) && - READ_ONCE(my_rdp->nocb_gp_sleep)) { - raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); - my_rdp->nocb_gp_sleep = false;/* No need to sleep.*/ - raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { + needwake = rdp->nocb_cb_sleep; + WRITE_ONCE(rdp->nocb_cb_sleep, false); + smp_mb(); /* CB invocation -after- GP end. */ + } else { + needwake = false; } - if (!rdp->nocb_gp_head) - continue; /* No CBs, so no need to wake kthread. */ - - /* Append callbacks to CB kthread's "done" list. */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - tail = rdp->nocb_cb_tail; - rdp->nocb_cb_tail = rdp->nocb_gp_tail; - *tail = rdp->nocb_gp_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - if (tail == &rdp->nocb_cb_head) { - /* List was empty, so wake up the kthread. */ + if (needwake) { swake_up_one(&rdp->nocb_cb_wq); + gotcbs = true; } + if (needwake_gp) + rcu_gp_kthread_wake(); + } + + if (rcu_nocb_poll) { + /* Polling, so trace if first poll in the series. */ + if (gotcbs) + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); + schedule_timeout_interruptible(1); + } else if (!needwait_gp) { + /* Wait for callbacks to appear. 
*/ + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, + !READ_ONCE(my_rdp->nocb_gp_sleep)); + } else { + rnp = my_rdp->mynode; + trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); + swait_event_interruptible_exclusive( + rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1], + rcu_seq_done(&rnp->gp_seq, wait_gp_seq) || + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); + } + if (!rcu_nocb_poll) { + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); + WRITE_ONCE(my_rdp->nocb_gp_sleep, true); + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } + WARN_ON(signal_pending(current)); } /* @@ -1871,92 +1752,69 @@ static int rcu_nocb_gp_kthread(void *arg) { struct rcu_data *rdp = arg; - for (;;) + for (;;) { nocb_gp_wait(rdp); + cond_resched_tasks_rcu_qs(); + } return 0; } /* - * No-CBs CB kthreads come here to wait for additional callbacks to show up. - * This function returns true ("keep waiting") until callbacks appear and - * then false ("stop waiting") when callbacks finally do appear. + * Invoke any ready callbacks from the corresponding no-CBs CPU, + * then, if there are no more, wait for more to appear. */ -static bool nocb_cb_wait(struct rcu_data *rdp) +static void nocb_cb_wait(struct rcu_data *rdp) { + unsigned long flags; + bool needwake_gp = false; + struct rcu_node *rnp = rdp->mynode; + + local_irq_save(flags); + rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + local_bh_disable(); + rcu_do_batch(rdp); + local_bh_enable(); + lockdep_assert_irqs_enabled(); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + if (needwake_gp) + rcu_gp_kthread_wake(); + return; + } + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); + WRITE_ONCE(rdp->nocb_cb_sleep, true); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + if (needwake_gp) + rcu_gp_kthread_wake(); swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - READ_ONCE(rdp->nocb_cb_head)); - if (smp_load_acquire(&rdp->nocb_cb_head)) { /* VVV */ - /* ^^^ Ensure CB invocation follows _head test. */ - return false; + !READ_ONCE(rdp->nocb_cb_sleep)); + if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */ + /* ^^^ Ensure CB invocation follows _sleep test. */ + return; } WARN_ON(signal_pending(current)); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - return true; } /* - * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU, however, there is an - * optional GP-CB relationship so that the grace-period kthreads don't - * have to do quite so many wakeups (as in they only need to wake the - * no-CBs GP kthreads, not the CB kthreads). + * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke + * nocb_cb_wait() to do the dirty work. */ static int rcu_nocb_cb_kthread(void *arg) { - int c, cl; - unsigned long flags; - struct rcu_head *list; - struct rcu_head *next; - struct rcu_head **tail; struct rcu_data *rdp = arg; - /* Each pass through this loop invokes one batch of callbacks */ + // Each pass through this loop does one callback batch, and, + // if there are no more ready callbacks, waits for them. for (;;) { - /* Wait for callbacks. 
*/ - while (nocb_cb_wait(rdp)) - continue; - - /* Pull the ready-to-invoke callbacks onto local list. */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - list = rdp->nocb_cb_head; - rdp->nocb_cb_head = NULL; - tail = rdp->nocb_cb_tail; - rdp->nocb_cb_tail = &rdp->nocb_cb_head; - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - if (WARN_ON_ONCE(!list)) - continue; - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); - - /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rcu_state.name, - atomic_long_read(&rdp->nocb_q_count_lazy), - rcu_get_n_cbs_nocb_cpu(rdp), -1); - c = cl = 0; - while (list) { - next = list->next; - /* Wait for enqueuing to complete, if needed. */ - while (next == NULL && &list->next != tail) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WaitQueue")); - schedule_timeout_interruptible(1); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WokeQueue")); - next = list->next; - } - debug_rcu_head_unqueue(list); - local_bh_disable(); - if (__rcu_reclaim(rcu_state.name, list)) - cl++; - c++; - local_bh_enable(); - cond_resched_tasks_rcu_qs(); - list = next; - } - trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1); - smp_mb__before_atomic(); /* _add after CB invocation. */ - atomic_long_add(-c, &rdp->nocb_q_count); - atomic_long_add(-cl, &rdp->nocb_q_count_lazy); + nocb_cb_wait(rdp); + cond_resched_tasks_rcu_qs(); } return 0; } @@ -1980,7 +1838,7 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - __wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); } @@ -2194,10 +2052,21 @@ static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static bool rcu_nocb_cpu_needs_barrier(int cpu) +/* No ->nocb_lock to acquire. */ +static void rcu_nocb_lock(struct rcu_data *rdp) { - WARN_ON_ONCE(1); /* Should be dead code. */ - return false; +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + local_irq_restore(flags); } static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) @@ -2213,17 +2082,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) { } -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy, unsigned long flags) +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags) { - return false; -} - -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, - struct rcu_data *rdp, - unsigned long flags) -{ - return false; + WARN_ON_ONCE(1); /* Should be dead code! */ } static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -- cgit v1.2.3 From e7f4c5b3998a3cf1bd8dbf110948075b47ac9b78 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 May 2019 07:18:00 -0700 Subject: rcu/nocb: Remove obsolete nocb_head and nocb_tail fields Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.h | 3 --- kernel/rcu/tree_plugin.h | 1 - 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 529eec2aa74d..74e3a4ab8095 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -194,8 +194,6 @@ struct rcu_data { /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU - struct rcu_head *nocb_head; /* CBs waiting for kthread. */ - struct rcu_head **nocb_tail; atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_cb_head; /* CBs ready to invoke. */ @@ -211,7 +209,6 @@ struct rcu_data { /* CBs waiting for GP. */ struct rcu_head **nocb_gp_tail; bool nocb_gp_sleep; /* Is the nocb GP thread asleep? */ - bool nocb_gp_forced; /* Forced nocb GP thread wakeup? */ struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ struct task_struct *nocb_cb_kthread; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index feffc46cccb0..838e0caaf53a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1911,7 +1911,6 @@ void __init rcu_init_nohz(void) /* Initialize per-rcu_data variables for no-CBs CPUs. */ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { - rdp->nocb_tail = &rdp->nocb_head; init_swait_queue_head(&rdp->nocb_cb_wq); init_swait_queue_head(&rdp->nocb_gp_wq); rdp->nocb_cb_tail = &rdp->nocb_cb_head; -- cgit v1.2.3 From c035280f1761b3336f4dad336906c19735d7ba5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 May 2019 08:28:41 -0700 Subject: rcu/nocb: Remove obsolete nocb_q_count and nocb_q_count_lazy fields This commit removes the obsolete nocb_q_count and nocb_q_count_lazy fields, also removing rcu_get_n_cbs_nocb_cpu(), adjusting rcu_get_n_cbs_cpu(), and making rcutree_migrate_callbacks() once again disable the ->cblist fields of offline CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree.h | 3 --- kernel/rcu/tree_plugin.h | 14 -------------- 3 files changed, 3 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 054418d2d960..e5f30b364276 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -210,10 +210,9 @@ static long rcu_get_n_cbs_cpu(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - if (rcu_segcblist_is_enabled(&rdp->cblist) && - !rcu_segcblist_is_offloaded(&rdp->cblist)) /* Online normal CPU? */ + if (rcu_segcblist_is_enabled(&rdp->cblist)) return rcu_segcblist_n_cbs(&rdp->cblist); - return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */ + return 0; } void rcu_softirq_qs(void) @@ -3181,6 +3180,7 @@ void rcutree_migrate_callbacks(int cpu) needwake = rcu_advance_cbs(my_rnp, rdp) || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 74e3a4ab8095..d1df192272fb 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -194,8 +194,6 @@ struct rcu_data { /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU - atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ - atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_cb_head; /* CBs ready to invoke. 
*/ struct rcu_head **nocb_cb_tail; struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ @@ -437,7 +435,6 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 838e0caaf53a..458838c63a6c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2040,15 +2040,6 @@ void rcu_bind_current_to_nocb(void) } EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); -/* - * Return the number of RCU callbacks still queued from the specified - * CPU, which must be a nocbs CPU. - */ -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) -{ - return atomic_long_read(&rdp->nocb_q_count); -} - #else /* #ifdef CONFIG_RCU_NOCB_CPU */ /* No ->nocb_lock to acquire. */ @@ -2108,11 +2099,6 @@ static void __init rcu_spawn_nocb_kthreads(void) { } -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) -{ - return 0; -} - #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* -- cgit v1.2.3 From 2a777de757f4c7050997c6232a585eff59c5ea36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 May 2019 09:10:24 -0700 Subject: rcu/nocb: Remove obsolete nocb_cb_tail and nocb_cb_head fields Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 -- kernel/rcu/tree_plugin.h | 1 - 2 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d1df192272fb..6e4cf7de303f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -194,8 +194,6 @@ struct rcu_data { /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU - struct rcu_head *nocb_cb_head; /* CBs ready to invoke. */ - struct rcu_head **nocb_cb_tail; struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 458838c63a6c..1847fffdfa0a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1913,7 +1913,6 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { init_swait_queue_head(&rdp->nocb_cb_wq); init_swait_queue_head(&rdp->nocb_gp_wq); - rdp->nocb_cb_tail = &rdp->nocb_cb_head; raw_spin_lock_init(&rdp->nocb_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); } -- cgit v1.2.3 From 4f9c1bc727f917c8c32ee1decc88e89057e0dffc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 May 2019 09:20:10 -0700 Subject: rcu/nocb: Remove obsolete nocb_gp_head and nocb_gp_tail fields Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6e4cf7de303f..c12e85c12310 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -201,10 +201,8 @@ struct rcu_data { struct timer_list nocb_timer; /* Enforce finite deferral. */ /* The following fields are used by GP kthread, hence own cacheline. */ - struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; - /* CBs waiting for GP. */ - struct rcu_head **nocb_gp_tail; - bool nocb_gp_sleep; /* Is the nocb GP thread asleep? 
*/ + bool nocb_gp_sleep ____cacheline_internodealigned_in_smp; + /* Is the nocb GP thread asleep? */ struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ struct task_struct *nocb_cb_kthread; -- cgit v1.2.3 From ec5ef87bac820be8ae9cc0a95108cded039ed8ef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 May 2019 13:03:49 -0700 Subject: rcu/nocb: Use build-time no-CBs check in rcu_do_batch() Currently, rcu_do_batch() invokes rcu_segcblist_is_offloaded() each time it needs to know whether the current CPU is a no-CBs CPU. Given that it is not possible to change the no-CBs status of a CPU after boot, and given that it is not possible to even have no-CBs CPUs in CONFIG_RCU_NOCB_CPU=n kernels, this per-callback invocation wastes CPU. This commit therefore created a const on-stack variable to allow this check to be done only once per rcu_do_batch() invocation. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e5f30b364276..16dabd6b36d7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2087,6 +2087,8 @@ int rcutree_dead_cpu(unsigned int cpu) static void rcu_do_batch(struct rcu_data *rdp) { unsigned long flags; + const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count; @@ -2128,12 +2130,11 @@ static void rcu_do_batch(struct rcu_data *rdp) * Stop only if limit reached and CPU has something to do. * Note: The rcl structure counts down from zero. */ - if (-rcl.len >= bl && - !rcu_segcblist_is_offloaded(&rdp->cblist) && + if (-rcl.len >= bl && !offloaded && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (offloaded) { WARN_ON_ONCE(in_serving_softirq()); local_bh_enable(); lockdep_assert_irqs_enabled(); @@ -2175,8 +2176,7 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_unlock_irqrestore(rdp, flags); /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (!rcu_segcblist_is_offloaded(&rdp->cblist) && - rcu_segcblist_ready_cbs(&rdp->cblist)) + if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_core(); } -- cgit v1.2.3 From c1ab99d66ebcebedd9d416a840c488eaf079f3e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 May 2019 13:39:15 -0700 Subject: rcu/nocb: Use build-time no-CBs check in rcu_core() Currently, rcu_core() invokes rcu_segcblist_is_offloaded() each time it needs to know whether the current CPU is a no-CBs CPU. Given that it is not possible to change the no-CBs status of a CPU after boot, and given that it is not possible to even have no-CBs CPUs in CONFIG_RCU_NOCB_CPU=n kernels, this repeated runtime invocation wastes CPU. This commit therefore created a const on-stack variable to allow this check to be done only once per rcu_core() invocation. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 16dabd6b36d7..14939273d120 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2302,6 +2302,8 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_is_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2321,8 +2323,7 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && - !rcu_segcblist_is_offloaded(&rdp->cblist)) { + rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) { local_irq_save(flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); @@ -2332,8 +2333,7 @@ static __latent_entropy void rcu_core(void) rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (!rcu_segcblist_is_offloaded(&rdp->cblist) && - rcu_segcblist_ready_cbs(&rdp->cblist) && + if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) rcu_do_batch(rdp); -- cgit v1.2.3 From 921bb5fad11c0e8ec5f7625547552b252281f4de Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 May 2019 13:53:28 -0700 Subject: rcu/nocb: Use build-time no-CBs check in rcu_pending() Currently, rcu_pending() invokes rcu_segcblist_is_offloaded() even in CONFIG_RCU_NOCB_CPU=n kernels, which cannot possibly be offloaded. Given that rcu_pending() is on a fastpath, it makes sense to check for CONFIG_RCU_NOCB_CPU=y before invoking rcu_segcblist_is_offloaded(). This commit therefore makes this change. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 14939273d120..fb6b80aa34f6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2773,7 +2773,8 @@ static int rcu_pending(void) /* Has RCU gone idle with this CPU needing another grace period? */ if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist) && - !rcu_segcblist_is_offloaded(&rdp->cblist) && + (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) || + !rcu_segcblist_is_offloaded(&rdp->cblist)) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; -- cgit v1.2.3 From 969974e5c51e717fc9070b00eb2f61ae589ed13d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 May 2019 09:35:11 -0700 Subject: rcu/nocb: Suppress uninitialized false-positive in nocb_gp_wait() Some compilers complain that wait_gp_seq might be used uninitialized in nocb_gp_wait(). This cannot actually happen because when wait_gp_seq is uninitialized, needwait_gp must be false, which prevents wait_gp_seq from being used. But this analysis is apparently beyond some compilers, so this commit adds a bogus initialization of wait_gp_seq for the sole purpose of suppressing the false-positive warning. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1847fffdfa0a..c1dfbac8cd39 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1668,12 +1668,12 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) unsigned long cur_gp_seq; unsigned long flags; bool gotcbs; - bool needwait_gp = false; + bool needwait_gp = false; // This prevents actual uninitialized use. bool needwake; bool needwake_gp; struct rcu_data *rdp; struct rcu_node *rnp; - unsigned long wait_gp_seq; + unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. /* * Each pass through the following loop checks for CBs and for the -- cgit v1.2.3 From 0bd55c693617cd2488378d011b66b92e1dd66ecf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 Aug 2019 10:28:08 -0700 Subject: rcu/nohz: Turn off tick for offloaded CPUs Historically, no-CBs CPUs allowed the scheduler-clock tick to be unconditionally disabled on any transition to idle or nohz_full userspace execution (see the rcu_needs_cpu() implementations). Unfortunately, the checks used by rcu_needs_cpu() are defeated now that no-CBs CPUs use ->cblist, which might make users of battery-powered devices rather unhappy. This commit therefore adds explicit rcu_segcblist_is_offloaded() checks to return to the historical energy-efficient semantics. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c1dfbac8cd39..ae927710d670 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1226,10 +1226,10 @@ static void rcu_prepare_kthreads(int cpu) #if !defined(CONFIG_RCU_FAST_NO_HZ) /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. + * Check to see if any future non-offloaded RCU-related work will need + * to be done by the current CPU, even if none need be done immediately, + * returning 1 if so. This function is part of the RCU implementation; + * it is -not- an exported member of the RCU API. * * Because we not have RCU_FAST_NO_HZ, just check whether or not this * CPU has RCU callbacks queued. @@ -1237,7 +1237,8 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist); + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && + !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist); } /* @@ -1338,8 +1339,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) lockdep_assert_irqs_disabled(); - /* If no callbacks, RCU doesn't need the CPU. */ - if (rcu_segcblist_empty(&rdp->cblist)) { + /* If no non-offloaded callbacks, RCU doesn't need the CPU. */ + if (rcu_segcblist_empty(&rdp->cblist) || + rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) { *nextevt = KTIME_MAX; return 0; } -- cgit v1.2.3 From aeeacd9d844b2219d47e9010298b635c68a2a4c9 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 23 May 2019 10:43:58 -0700 Subject: rcu/nocb: Enable re-awakening under high callback load The __call_rcu_nocb_wake() function and its predecessors set ->qlen_last_fqs_check to zero for the first callback and to LONG_MAX / 2 for forced reawakenings. The former can result in a too-quick reawakening when there are many callbacks ready to invoke and the latter prevents a second reawakening. This commit therefore sets ->qlen_last_fqs_check to the current number of callbacks in both cases. While in the area, this commit also moves both assignments under ->nocb_lock. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ae927710d670..7077ef7bea96 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1628,6 +1628,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, // Need to actually to a wakeup. len = rcu_segcblist_n_cbs(&rdp->cblist); if (was_alldone) { + rdp->qlen_last_fqs_check = len; if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ wake_nocb_gp(rdp, false, flags); @@ -1638,9 +1639,9 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, TPS("WakeEmptyIsDeferred")); rcu_nocb_unlock_irqrestore(rdp, flags); } - rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { /* ... or if many callbacks queued. */ + rdp->qlen_last_fqs_check = len; if (!irqs_disabled_flags(flags)) { wake_nocb_gp(rdp, true, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, @@ -1650,7 +1651,6 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, TPS("WakeOvfIsDeferred")); rcu_nocb_unlock_irqrestore(rdp, flags); } - rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); rcu_nocb_unlock_irqrestore(rdp, flags); -- cgit v1.2.3 From 383e13328373ae1e17119ff89c86ff5f9413f31c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 May 2019 13:49:26 -0700 Subject: rcu/nocb: Never downgrade ->nocb_defer_wakeup in wake_nocb_gp_defer() Currently, wake_nocb_gp_defer() simply stores whatever waketype was passed in, which can result in a RCU_NOCB_WAKE_FORCE being downgraded to RCU_NOCB_WAKE, which could in turn delay callback processing. This commit therefore adds a check so that wake_nocb_gp_defer() only updates ->nocb_defer_wakeup when the update increases the forcefulness, thus avoiding downgrades. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7077ef7bea96..b9e00660af60 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1600,7 +1600,8 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, { if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) mod_timer(&rdp->nocb_timer, jiffies + 1); - WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); + if (rdp->nocb_defer_wakeup < waketype) + WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); } -- cgit v1.2.3 From ce0a825e40606d6dbe6dfe90d4d4c0ccc9fa3bde Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 23 May 2019 13:56:12 -0700 Subject: rcu/nocb: Make __call_rcu_nocb_wake() safe for many callbacks It might be hard to imagine having more than two billion callbacks queued on a single CPU's ->cblist, but someone will do it sometime. This commit therefore makes __call_rcu_nocb_wake() handle this situation by upgrading local variable "len" from "int" to "long". Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b9e00660af60..5cbc0848647c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1615,7 +1615,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, unsigned long flags) __releases(rdp->nocb_lock) { - int len; + long len; struct task_struct *t; // If we are being polled or there is no kthread, just leave. -- cgit v1.2.3 From 7f36ef82e5cf0b401c2676fb3e56ad0633ed6ad5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 May 2019 05:54:26 -0700 Subject: rcu/nocb: Avoid needless wakeups of no-CBs grace-period kthread Currently, the code provides an extra wakeup for the no-CBs grace-period kthread if one of its CPUs is generating excessive numbers of callbacks. But satisfying though it is to wake something up when things are going south, unless the thing being awakened can actually help solve the problem, that extra wakeup does nothing but consume additional CPU time, which is exactly what you don't want during a call_rcu() flood. This commit therefore avoids doing anything if the corresponding no-CBs callback kthread is going full tilt. Otherwise, if advancing callbacks immediately might help and if the leaf rcu_node structure's lock is immediately available, this commit invokes a new variant of rcu_advance_cbs() that advances callbacks only if doing so won't require awakening the grace-period kthread (not to be confused with any of the no-CBs grace-period kthreads). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 +++++++++++++++ kernel/rcu/tree_plugin.h | 13 +++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fb6b80aa34f6..a6ddfae6978d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1334,6 +1334,19 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) return rcu_accelerate_cbs(rnp, rdp); } +/* + * Move and classify callbacks, but only if doing so won't require + * that the RCU grace-period kthread be awakened. + */ +static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, + struct rcu_data *rdp) +{ + raw_lockdep_assert_held_rcu_node(rnp); + if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) + return; + WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); +} + /* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node @@ -2118,6 +2131,8 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); + if (offloaded) + rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); rcu_nocb_unlock_irqrestore(rdp, flags); /* Invoke callbacks. 
*/ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5cbc0848647c..c10afe778430 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1643,10 +1643,15 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, } else if (len > rdp->qlen_last_fqs_check + qhimark) { /* ... or if many callbacks queued. */ rdp->qlen_last_fqs_check = len; - if (!irqs_disabled_flags(flags)) { - wake_nocb_gp(rdp, true, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeOvf")); + if (!rdp->nocb_cb_sleep && + rcu_segcblist_ready_cbs(&rdp->cblist)) { + // Already going full tilt, so don't try to rewake. + rcu_nocb_unlock_irqrestore(rdp, flags); + } else if (rcu_segcblist_pend_cbs(&rdp->cblist) && + raw_spin_trylock_rcu_node(rdp->mynode)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + raw_spin_unlock_rcu_node(rdp->mynode); + rcu_nocb_unlock_irqrestore(rdp, flags); } else { wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); -- cgit v1.2.3 From 81c0b3d724f419c0524f432c1ac22b9f518c2899 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 May 2019 07:18:08 -0700 Subject: rcu/nocb: Avoid ->nocb_lock capture by corresponding CPU A given rcu_data structure's ->nocb_lock can be acquired very frequently by the corresponding CPU and occasionally by the corresponding no-CBs grace-period and callbacks kthreads. In particular, these two kthreads will have frequent gaps between ->nocb_lock acquisitions that are roughly a grace period in duration. This means that any excessive ->nocb_lock contention will be due to the CPU's acquisitions, and this in turn enables a very naive contention-avoidance strategy to be quite effective. This commit therefore modifies rcu_nocb_lock() to first attempt a raw_spin_trylock(), and to atomically increment a separate ->nocb_lock_contended across a raw_spin_lock(). This new ->nocb_lock_contended field is checked in __call_rcu_nocb_wake() when interrupts are enabled, with a spin-wait for contending acquisitions to complete, thus allowing the kthreads a chance to acquire the lock. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 18 ++++++++++++- kernel/rcu/tree_plugin.h | 68 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 62 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c12e85c12310..7062f9d9c053 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -197,6 +197,7 @@ struct rcu_data { struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ + atomic_t nocb_lock_contended; /* Contention experienced. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ @@ -430,7 +431,22 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, unsigned long flags); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ +#define rcu_nocb_lock_irqsave(rdp, flags) \ +do { \ + if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) { \ + local_irq_save(flags); \ + } else if (!raw_spin_trylock_irqsave(&(rdp)->nocb_lock, (flags))) {\ + atomic_inc(&(rdp)->nocb_lock_contended); \ + smp_mb__after_atomic(); /* atomic_inc() before lock. */ \ + raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \ + smp_mb__before_atomic(); /* atomic_dec() after lock. 
*/ \ + atomic_dec(&(rdp)->nocb_lock_contended); \ + } \ +} while (0) +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags) +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c10afe778430..5f0894cec75d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1498,14 +1498,36 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); /* * Acquire the specified rcu_data structure's ->nocb_lock, but only - * if it corresponds to a no-CBs CPU. + * if it corresponds to a no-CBs CPU. If the lock isn't immediately + * available, increment ->nocb_lock_contended to flag the contention. */ static void rcu_nocb_lock(struct rcu_data *rdp) { - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { - lockdep_assert_irqs_disabled(); - raw_spin_lock(&rdp->nocb_lock); - } + lockdep_assert_irqs_disabled(); + if (!rcu_segcblist_is_offloaded(&rdp->cblist) || + raw_spin_trylock(&rdp->nocb_lock)) + return; + atomic_inc(&rdp->nocb_lock_contended); + smp_mb__after_atomic(); /* atomic_inc() before lock. */ + raw_spin_lock(&rdp->nocb_lock); + smp_mb__before_atomic(); /* atomic_dec() after lock. */ + atomic_dec(&rdp->nocb_lock_contended); +} + +/* + * Spinwait until the specified rcu_data structure's ->nocb_lock is + * not contended. Please note that this is extremely special-purpose, + * relying on the fact that at most two kthreads and one CPU contend for + * this lock, and also that the two kthreads are guaranteed to have frequent + * grace-period-duration time intervals between successive acquisitions + * of the lock. This allows us to use an extremely simple throttling + * mechanism, and further to apply it only to the CPU doing floods of + * call_rcu() invocations. Don't try this at home! + */ +static void rcu_nocb_wait_contended(struct rcu_data *rdp) +{ + while (atomic_read(&rdp->nocb_lock_contended)) + cpu_relax(); } /* @@ -1575,19 +1597,19 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, lockdep_assert_held(&rdp->nocb_lock); if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + rcu_nocb_unlock_irqrestore(rdp, flags); return; } if (READ_ONCE(rdp_gp->nocb_gp_sleep) || force) { del_timer(&rdp->nocb_timer); - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + rcu_nocb_unlock_irqrestore(rdp, flags); smp_mb(); /* enqueue before ->nocb_gp_sleep. */ - raw_spin_lock_irqsave(&rdp_gp->nocb_lock, flags); + rcu_nocb_lock_irqsave(rdp_gp, flags); WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); - raw_spin_unlock_irqrestore(&rdp_gp->nocb_lock, flags); + rcu_nocb_unlock_irqrestore(rdp_gp, flags); wake_up_process(rdp_gp->nocb_gp_kthread); } else { - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + rcu_nocb_unlock_irqrestore(rdp, flags); } } @@ -1646,23 +1668,23 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, if (!rdp->nocb_cb_sleep && rcu_segcblist_ready_cbs(&rdp->cblist)) { // Already going full tilt, so don't try to rewake. 
- rcu_nocb_unlock_irqrestore(rdp, flags); } else if (rcu_segcblist_pend_cbs(&rdp->cblist) && raw_spin_trylock_rcu_node(rdp->mynode)) { rcu_advance_cbs_nowake(rdp->mynode, rdp); raw_spin_unlock_rcu_node(rdp->mynode); - rcu_nocb_unlock_irqrestore(rdp, flags); } else { wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); - rcu_nocb_unlock_irqrestore(rdp, flags); } + rcu_nocb_unlock_irqrestore(rdp, flags); } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); rcu_nocb_unlock_irqrestore(rdp, flags); } - if (!irqs_disabled_flags(flags)) + if (!irqs_disabled_flags(flags)) { lockdep_assert_irqs_enabled(); + rcu_nocb_wait_contended(rdp); + } return; } @@ -1692,7 +1714,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) if (rcu_segcblist_empty(&rdp->cblist)) continue; /* No callbacks here, try next. */ rnp = rdp->mynode; - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + rcu_nocb_lock_irqsave(rdp, flags); WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); del_timer(&my_rdp->nocb_timer); raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ @@ -1712,7 +1734,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } else { needwake = false; } - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake) { swake_up_one(&rdp->nocb_cb_wq); gotcbs = true; @@ -1741,9 +1763,9 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); } if (!rcu_nocb_poll) { - raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); + rcu_nocb_lock_irqsave(my_rdp, flags); WRITE_ONCE(my_rdp->nocb_gp_sleep, true); - raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); + rcu_nocb_unlock_irqrestore(my_rdp, flags); } WARN_ON(signal_pending(current)); } @@ -1784,12 +1806,12 @@ static void nocb_cb_wait(struct rcu_data *rdp) rcu_do_batch(rdp); local_bh_enable(); lockdep_assert_irqs_enabled(); - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + rcu_nocb_lock_irqsave(rdp, flags); raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); return; @@ -1797,7 +1819,7 @@ static void nocb_cb_wait(struct rcu_data *rdp) trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); WRITE_ONCE(rdp->nocb_cb_sleep, true); - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); swait_event_interruptible_exclusive(rdp->nocb_cb_wq, @@ -1839,9 +1861,9 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) unsigned long flags; int ndw; - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + rcu_nocb_lock_irqsave(rdp, flags); if (!rcu_nocb_need_deferred_wakeup(rdp)) { - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + rcu_nocb_unlock_irqrestore(rdp, flags); return; } ndw = READ_ONCE(rdp->nocb_defer_wakeup); -- cgit v1.2.3 From 9fcb09bddd56bae42319b606bae86e85c625f868 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 1 Jun 2019 05:14:47 -0700 Subject: rcu/nocb: Round down for number of no-CBs grace-period kthreads Currently, when the square root of the number of CPUs is rounded down by int_sqrt(), this round-down is applied to the number of callback kthreads per grace-period kthreads. 
This makes almost no difference for large systems, but results in oddities such as three no-CBs grace-period kthreads for a five-CPU system, which is a bit excessive. This commit therefore causes the round-down to apply to the number of no-CBs grace-period kthreads, so that systems with from four to eight CPUs have only two no-CBs grace period kthreads. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5f0894cec75d..12212764ecd8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2028,7 +2028,7 @@ static void __init rcu_organize_nocb_kthreads(void) if (!cpumask_available(rcu_nocb_mask)) return; if (ls == -1) { - ls = int_sqrt(nr_cpu_ids); + ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); rcu_nocb_gp_stride = ls; } -- cgit v1.2.3 From 6608c3a027bcc0b34cc02bc764ea9f52b9dce46f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 1 Jun 2019 06:16:38 -0700 Subject: rcu/nocb: Reduce contention at no-CBs registry-time CB advancement Currently, __call_rcu_nocb_wake() conditionally acquires the leaf rcu_node structure's ->lock, and only afterwards does rcu_advance_cbs_nowake() check to see if it is possible to advance callbacks without potentially needing to awaken the grace-period kthread. Given that the no-awaken check can be done locklessly, this commit reverses the order, so that rcu_advance_cbs_nowake() is invoked without holding the leaf rcu_node structure's ->lock and rcu_advance_cbs_nowake() checks the grace-period state before conditionally acquiring that lock, thus reducing the number of needless acquistions of the leaf rcu_node structure's ->lock. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- kernel/rcu/tree_plugin.h | 4 +--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a6ddfae6978d..ec320658aeef 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1341,10 +1341,11 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, struct rcu_data *rdp) { - raw_lockdep_assert_held_rcu_node(rnp); - if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) + if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || + !raw_spin_trylock_rcu_node(rnp)) return; WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); + raw_spin_unlock_rcu_node(rnp); } /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 12212764ecd8..09c15f619e78 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1668,10 +1668,8 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, if (!rdp->nocb_cb_sleep && rcu_segcblist_ready_cbs(&rdp->cblist)) { // Already going full tilt, so don't try to rewake. - } else if (rcu_segcblist_pend_cbs(&rdp->cblist) && - raw_spin_trylock_rcu_node(rdp->mynode)) { + } else if (rcu_segcblist_pend_cbs(&rdp->cblist)) { rcu_advance_cbs_nowake(rdp->mynode, rdp); - raw_spin_unlock_rcu_node(rdp->mynode); } else { wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); -- cgit v1.2.3 From 523bddd553c09a2cf051eb724bffba680424f5ec Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sat, 1 Jun 2019 13:33:55 -0700 Subject: rcu/nocb: Reduce contention at no-CBs invocation-done time Currently, nocb_cb_wait() unconditionally acquires the leaf rcu_node ->lock to advance callbacks when done invoking the previous batch. It does this while holding ->nocb_lock, which means that contention on the leaf rcu_node ->lock visits itself on the ->nocb_lock. This commit therefore makes this lock acquisition conditional, forgoing callback advancement when the leaf rcu_node ->lock is not immediately available. (In this case, the no-CBs grace-period kthread will eventually do any needed callback advancement.) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 09c15f619e78..c4cbfb5dc48d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1805,9 +1805,10 @@ static void nocb_cb_wait(struct rcu_data *rdp) local_bh_enable(); lockdep_assert_irqs_enabled(); rcu_nocb_lock_irqsave(rdp, flags); - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + if (raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ + needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + } if (rcu_segcblist_ready_cbs(&rdp->cblist)) { rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) -- cgit v1.2.3 From 4fd8c5f153bc41ae847b9ddb1539b34f70c18278 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Jun 2019 13:41:08 -0700 Subject: rcu/nocb: Reduce ->nocb_lock contention with separate ->nocb_gp_lock The sleep/wakeup of the no-CBs grace-period kthreads is synchronized using the ->nocb_lock of the first CPU corresponding to that kthread. This commit provides a separate ->nocb_gp_lock for this purpose, thus reducing contention on ->nocb_lock. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 3 ++- kernel/rcu/tree_plugin.h | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7062f9d9c053..2c3e9068671c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -202,7 +202,8 @@ struct rcu_data { struct timer_list nocb_timer; /* Enforce finite deferral. */ /* The following fields are used by GP kthread, hence own cacheline. */ - bool nocb_gp_sleep ____cacheline_internodealigned_in_smp; + raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp; + bool nocb_gp_sleep; /* Is the nocb GP thread asleep? */ struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c4cbfb5dc48d..e92bc39c4008 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1604,9 +1604,9 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, del_timer(&rdp->nocb_timer); rcu_nocb_unlock_irqrestore(rdp, flags); smp_mb(); /* enqueue before ->nocb_gp_sleep. 
*/ - rcu_nocb_lock_irqsave(rdp_gp, flags); + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); - rcu_nocb_unlock_irqrestore(rdp_gp, flags); + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); wake_up_process(rdp_gp->nocb_gp_kthread); } else { rcu_nocb_unlock_irqrestore(rdp, flags); @@ -1761,9 +1761,9 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); } if (!rcu_nocb_poll) { - rcu_nocb_lock_irqsave(my_rdp, flags); + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); WRITE_ONCE(my_rdp->nocb_gp_sleep, true); - rcu_nocb_unlock_irqrestore(my_rdp, flags); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); } WARN_ON(signal_pending(current)); } @@ -1943,6 +1943,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) init_swait_queue_head(&rdp->nocb_cb_wq); init_swait_queue_head(&rdp->nocb_gp_wq); raw_spin_lock_init(&rdp->nocb_lock); + raw_spin_lock_init(&rdp->nocb_gp_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); } -- cgit v1.2.3 From faca5c250935262f026cac1bb951a0f7672474b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Jun 2019 09:50:38 -0700 Subject: rcu/nocb: Unconditionally advance and wake for excessive CBs When there are excessive numbers of callbacks, and when either the corresponding no-CBs callback kthread is asleep or there is no more ready-to-invoke callbacks, and when least one callback is pending, __call_rcu_nocb_wake() will advance the callbacks, but refrain from awakening the corresponding no-CBs grace-period kthread. However, because rcu_advance_cbs_nowake() is used, it is possible (if a bit unlikely) that the needed advancement could not happen due to a grace period not being in progress. Plus there will always be at least one pending callback due to one having just now been enqueued. This commit therefore attempts to advance callbacks and awakens the no-CBs grace-period kthread when there are excessive numbers of callbacks posted and when the no-CBs callback kthread is not in a position to do anything helpful. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e92bc39c4008..4e49bb359464 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1668,13 +1668,19 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, if (!rdp->nocb_cb_sleep && rcu_segcblist_ready_cbs(&rdp->cblist)) { // Already going full tilt, so don't try to rewake. - } else if (rcu_segcblist_pend_cbs(&rdp->cblist)) { - rcu_advance_cbs_nowake(rdp->mynode, rdp); + rcu_nocb_unlock_irqrestore(rdp, flags); } else { - wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, - TPS("WakeOvfIsDeferred")); + rcu_advance_cbs_nowake(rdp->mynode, rdp); + if (!irqs_disabled_flags(flags)) { + wake_nocb_gp(rdp, false, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("WakeOvf")); + } else { + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, + TPS("WakeOvfIsDeferred")); + rcu_nocb_unlock_irqrestore(rdp, flags); + } } - rcu_nocb_unlock_irqrestore(rdp, flags); } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); rcu_nocb_unlock_irqrestore(rdp, flags); -- cgit v1.2.3 From eda669a6a2c517fd6db41d0fe3c95c1b749c60bd Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 1 Jul 2019 17:36:53 -0700 Subject: rcu/nocb: Atomic ->len field in rcu_segcblist structure Upcoming ->nocb_lock contention-reduction work requires that the rcu_segcblist structure's ->len field be concurrently manipulated, but only if there are no-CBs CPUs in the kernel. This commit therefore makes this ->len field be an atomic_long_t, but only in CONFIG_RCU_NOCB_CPU=y kernels. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 86 ++++++++++++++++++++++++++++++++++++++++++---- kernel/rcu/rcu_segcblist.h | 12 ++++++- 2 files changed, 90 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 92968b856593..ff431cc83037 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -23,6 +23,19 @@ void rcu_cblist_init(struct rcu_cblist *rclp) rclp->len_lazy = 0; } +/* + * Enqueue an rcu_head structure onto the specified callback list. + * This function assumes that the callback is non-lazy because it + * is intended for use by no-CBs CPUs, which do not distinguish + * between lazy and non-lazy RCU callbacks. + */ +void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) +{ + *rclp->tail = rhp; + rclp->tail = &rhp->next; + WRITE_ONCE(rclp->len, rclp->len + 1); +} + /* * Dequeue the oldest rcu_head structure from the specified callback * list. This function assumes that the callback is non-lazy, but @@ -44,6 +57,67 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) return rhp; } +/* Set the length of an rcu_segcblist structure. */ +void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU + atomic_long_set(&rsclp->len, v); +#else + WRITE_ONCE(rsclp->len, v); +#endif +} + +/* + * Increase the numeric length of an rcu_segcblist structure by the + * specified amount, which can be negative. This can cause the ->len + * field to disagree with the actual number of callbacks on the structure. + * This increase is fully ordered with respect to the callers accesses + * both before and after. + */ +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU + smp_mb__before_atomic(); /* Up to the caller! */ + atomic_long_add(v, &rsclp->len); + smp_mb__after_atomic(); /* Up to the caller! */ +#else + smp_mb(); /* Up to the caller! */ + WRITE_ONCE(rsclp->len, rsclp->len + v); + smp_mb(); /* Up to the caller! */ +#endif +} + +/* + * Increase the numeric length of an rcu_segcblist structure by one. + * This can cause the ->len field to disagree with the actual number of + * callbacks on the structure. This increase is fully ordered with respect + * to the callers accesses both before and after. + */ +void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) +{ + rcu_segcblist_add_len(rsclp, 1); +} + +/* + * Exchange the numeric length of the specified rcu_segcblist structure + * with the specified value. This can cause the ->len field to disagree + * with the actual number of callbacks on the structure. This exchange is + * fully ordered with respect to the callers accesses both before and after. + */ +long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU + return atomic_long_xchg(&rsclp->len, v); +#else + long ret = rsclp->len; + + smp_mb(); /* Up to the caller! */ + WRITE_ONCE(rsclp->len, v); + smp_mb(); /* Up to the caller! */ + return ret; +#endif +} + /* * Initialize an rcu_segcblist structure. 
*/ @@ -56,7 +130,7 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) rsclp->head = NULL; for (i = 0; i < RCU_CBLIST_NSEGS; i++) rsclp->tails[i] = &rsclp->head; - rsclp->len = 0; + rcu_segcblist_set_len(rsclp, 0); rsclp->len_lazy = 0; rsclp->enabled = 1; } @@ -151,7 +225,7 @@ bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp, bool lazy) { - WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + rcu_segcblist_inc_len(rsclp); if (lazy) rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is enqueued. */ @@ -177,7 +251,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, if (rcu_segcblist_n_cbs(rsclp) == 0) return false; - WRITE_ONCE(rsclp->len, rsclp->len + 1); + rcu_segcblist_inc_len(rsclp); if (lazy) rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is entrained. */ @@ -204,9 +278,8 @@ void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { rclp->len_lazy += rsclp->len_lazy; - rclp->len += rsclp->len; rsclp->len_lazy = 0; - WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ + rclp->len = rcu_segcblist_xchg_len(rsclp, 0); } /* @@ -259,8 +332,7 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { rsclp->len_lazy += rclp->len_lazy; - /* ->len sampled locklessly. */ - WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rcu_segcblist_add_len(rsclp, rclp->len); rclp->len_lazy = 0; rclp->len = 0; } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index db38f0a512c4..1ff996647d3c 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -9,6 +9,12 @@ #include +/* Return number of callbacks in the specified callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) +{ + return READ_ONCE(rclp->len); +} + /* * Account for the fact that a previously dequeued callback turned out * to be marked as lazy. @@ -42,7 +48,11 @@ static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) /* Return number of callbacks in segmented callback list. */ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) { +#ifdef CONFIG_RCU_NOCB_CPU + return atomic_long_read(&rsclp->len); +#else return READ_ONCE(rsclp->len); +#endif } /* Return number of lazy callbacks in segmented callback list. */ @@ -54,7 +64,7 @@ static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) /* Return number of lazy callbacks in segmented callback list. */ static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) { - return rsclp->len - rsclp->len_lazy; + return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy; } /* -- cgit v1.2.3 From d1b222c6be1f8bfc77099e034219732ecaeaaf96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jul 2019 16:03:33 -0700 Subject: rcu/nocb: Add bypass callback queueing Use of the rcu_data structure's segmented ->cblist for no-CBs CPUs takes advantage of unrelated grace periods, thus reducing the memory footprint in the face of floods of call_rcu() invocations. However, the ->cblist field is a more-complex rcu_segcblist structure which must be protected via locking. Even though there are only three entities which can acquire this lock (the CPU invoking call_rcu(), the no-CBs grace-period kthread, and the no-CBs callbacks kthread), the contention on this lock is excessive under heavy stress. 
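
(A quick aside on the preceding ->len patch: making the length atomic only in CONFIG_RCU_NOCB_CPU=y builds keeps the common case cheap while allowing concurrent updates where offloading needs them. A rough userspace analogue using C11 atomics is sketched below; DEMO_OFFLOADED and the demo_* names are invented for this sketch and are not kernel interfaces.)

#include <stdatomic.h>
#include <stdio.h>

#define DEMO_OFFLOADED 1        /* stands in for CONFIG_RCU_NOCB_CPU */

struct demo_cblist {
#if DEMO_OFFLOADED
        atomic_long len;        /* concurrent updaters allowed */
#else
        long len;               /* updated under the caller's locking */
#endif
};

static void demo_set_len(struct demo_cblist *l, long v)
{
#if DEMO_OFFLOADED
        atomic_store(&l->len, v);
#else
        l->len = v;
#endif
}

static void demo_add_len(struct demo_cblist *l, long v)
{
#if DEMO_OFFLOADED
        atomic_fetch_add(&l->len, v);   /* seq_cst: ordered before and after */
#else
        l->len += v;
#endif
}

static long demo_xchg_len(struct demo_cblist *l, long v)
{
#if DEMO_OFFLOADED
        return atomic_exchange(&l->len, v);
#else
        long old = l->len;

        l->len = v;
        return old;
#endif
}

int main(void)
{
        struct demo_cblist l;

        demo_set_len(&l, 0);
        demo_add_len(&l, 3);
        printf("extracted %ld callbacks\n", demo_xchg_len(&l, 0));
        return 0;
}

(In the kernel the ordering around the non-atomic updates is supplied by explicit smp_mb() calls; the sketch leans on sequentially consistent atomics instead.)
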
This commit therefore greatly reduces contention by provisioning an rcu_cblist structure field named ->nocb_bypass within the rcu_data structure. Each no-CBs CPU is permitted only a limited number of enqueues onto the ->cblist per jiffy, controlled by a new nocb_nobypass_lim_per_jiffy kernel boot parameter that defaults to about 16 enqueues per millisecond (16 * 1000 / HZ). When that limit is exceeded, the CPU instead enqueues onto the new ->nocb_bypass. The ->nocb_bypass is flushed into the ->cblist every jiffy or when the number of callbacks on ->nocb_bypass exceeds qhimark, whichever happens first. During call_rcu() floods, this flushing is carried out by the CPU during the course of its call_rcu() invocations. However, a CPU could simply stop invoking call_rcu() at any time. The no-CBs grace-period kthread therefore carries out less-aggressive flushing (every few jiffies or when the number of callbacks on ->nocb_bypass exceeds (2 * qhimark), whichever comes first). This means that the no-CBs grace-period kthread cannot be permitted to do unbounded waits while there are callbacks on ->nocb_bypass. A ->nocb_bypass_timer is used to provide the needed wakeups. [ paulmck: Apply Coverity feedback reported by Colin Ian King. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 30 ++++ kernel/rcu/rcu_segcblist.h | 5 + kernel/rcu/tree.c | 16 +- kernel/rcu/tree.h | 28 ++-- kernel/rcu/tree_plugin.h | 357 +++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 395 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index ff431cc83037..495c58ce1640 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -36,6 +36,36 @@ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) WRITE_ONCE(rclp->len, rclp->len + 1); } +/* + * Flush the second rcu_cblist structure onto the first one, obliterating + * any contents of the first. If rhp is non-NULL, enqueue it as the sole + * element of the second rcu_cblist structure, but ensuring that the second + * rcu_cblist structure, if initially non-empty, always appears non-empty + * throughout the process. If rdp is NULL, the second rcu_cblist structure + * is instead initialized to empty. + */ +void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, + struct rcu_cblist *srclp, + struct rcu_head *rhp) +{ + drclp->head = srclp->head; + if (drclp->head) + drclp->tail = srclp->tail; + else + drclp->tail = &drclp->head; + drclp->len = srclp->len; + drclp->len_lazy = srclp->len_lazy; + if (!rhp) { + rcu_cblist_init(srclp); + } else { + rhp->next = NULL; + srclp->head = rhp; + srclp->tail = &rhp->next; + WRITE_ONCE(srclp->len, 1); + srclp->len_lazy = 0; + } +} + /* * Dequeue the oldest rcu_head structure from the specified callback * list. 
This function assumes that the callback is non-lazy, but diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 1ff996647d3c..815c2fdd3fcc 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -25,6 +25,10 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) } void rcu_cblist_init(struct rcu_cblist *rclp); +void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); +void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, + struct rcu_cblist *srclp, + struct rcu_head *rhp); struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); /* @@ -92,6 +96,7 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) return !READ_ONCE(*READ_ONCE(rsclp->tails[seg])); } +void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); void rcu_segcblist_offload(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ec320658aeef..457623100d12 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1251,6 +1251,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) unsigned long gp_seq_req; bool ret = false; + rcu_lockdep_assert_cblist_protected(rdp); raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ @@ -1292,7 +1293,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, unsigned long c; bool needwake; - lockdep_assert_irqs_disabled(); + rcu_lockdep_assert_cblist_protected(rdp); c = rcu_seq_snap(&rcu_state.gp_seq); if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { /* Old request still live, so mark recent callbacks. */ @@ -1318,6 +1319,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, */ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) { + rcu_lockdep_assert_cblist_protected(rdp); raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ @@ -1341,6 +1343,7 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, struct rcu_data *rdp) { + rcu_lockdep_assert_cblist_protected(rdp); if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp)) return; @@ -2187,7 +2190,9 @@ static void rcu_do_batch(struct rcu_data *rdp) * The following usually indicates a double call_rcu(). To track * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. */ - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); + WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist)); + WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + count != 0 && rcu_segcblist_empty(&rdp->cblist)); rcu_nocb_unlock_irqrestore(rdp, flags); @@ -2564,8 +2569,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); } - rcu_nocb_lock(rdp); - was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) + return; // Enqueued onto ->nocb_bypass, so just leave. + /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. 
*/ rcu_segcblist_enqueue(&rdp->cblist, head, lazy); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, @@ -2839,6 +2845,7 @@ static void rcu_barrier_func(void *unused) rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); rcu_nocb_lock(rdp); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { @@ -3192,6 +3199,7 @@ void rcutree_migrate_callbacks(int cpu) my_rdp = this_cpu_ptr(&rcu_data); my_rnp = my_rdp->mynode; rcu_nocb_lock(my_rdp); /* irqs already disabled. */ + WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies)); raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */ /* Leverage recent GPs and set GP for new callbacks. */ needwake = rcu_advance_cbs(my_rnp, rdp) || diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2c3e9068671c..e4df86db8137 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -200,18 +200,26 @@ struct rcu_data { atomic_t nocb_lock_contended; /* Contention experienced. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ + unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ + + /* The following fields are used by call_rcu, hence own cacheline. */ + raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp; + struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */ + unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */ + unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */ + int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */ /* The following fields are used by GP kthread, hence own cacheline. */ raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp; - bool nocb_gp_sleep; - /* Is the nocb GP thread asleep? */ + struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */ + bool nocb_gp_sleep; /* Is the nocb GP thread asleep? */ struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ struct task_struct *nocb_cb_kthread; struct rcu_data *nocb_next_cb_rdp; /* Next rcu_data in wakeup chain. */ - /* The following fields are used by CB kthread, hence new cachline. */ + /* The following fields are used by CB kthread, hence new cacheline. */ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; /* GP rdp takes GP-end wakeups. 
*/ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ @@ -419,6 +427,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j); +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags); static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); @@ -430,19 +442,15 @@ static void rcu_nocb_lock(struct rcu_data *rdp); static void rcu_nocb_unlock(struct rcu_data *rdp); static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, unsigned long flags); +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); #define rcu_nocb_lock_irqsave(rdp, flags) \ do { \ - if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) { \ + if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \ local_irq_save(flags); \ - } else if (!raw_spin_trylock_irqsave(&(rdp)->nocb_lock, (flags))) {\ - atomic_inc(&(rdp)->nocb_lock_contended); \ - smp_mb__after_atomic(); /* atomic_inc() before lock. */ \ + else \ raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \ - smp_mb__before_atomic(); /* atomic_dec() after lock. */ \ - atomic_dec(&(rdp)->nocb_lock_contended); \ - } \ } while (0) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ #define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4e49bb359464..12b14d7a2cf2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1497,19 +1497,26 @@ static int __init parse_rcu_nocb_poll(char *arg) early_param("rcu_nocb_poll", parse_rcu_nocb_poll); /* - * Acquire the specified rcu_data structure's ->nocb_lock, but only - * if it corresponds to a no-CBs CPU. If the lock isn't immediately - * available, increment ->nocb_lock_contended to flag the contention. + * Don't bother bypassing ->cblist if the call_rcu() rate is low. + * After all, the main point of bypassing is to avoid lock contention + * on ->nocb_lock, which only can happen at high call_rcu() rates. */ -static void rcu_nocb_lock(struct rcu_data *rdp) +int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; +module_param(nocb_nobypass_lim_per_jiffy, int, 0); + +/* + * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the + * lock isn't immediately available, increment ->nocb_lock_contended to + * flag the contention. + */ +static void rcu_nocb_bypass_lock(struct rcu_data *rdp) { lockdep_assert_irqs_disabled(); - if (!rcu_segcblist_is_offloaded(&rdp->cblist) || - raw_spin_trylock(&rdp->nocb_lock)) + if (raw_spin_trylock(&rdp->nocb_bypass_lock)) return; atomic_inc(&rdp->nocb_lock_contended); smp_mb__after_atomic(); /* atomic_inc() before lock. */ - raw_spin_lock(&rdp->nocb_lock); + raw_spin_lock(&rdp->nocb_bypass_lock); smp_mb__before_atomic(); /* atomic_dec() after lock. */ atomic_dec(&rdp->nocb_lock_contended); } @@ -1530,6 +1537,37 @@ static void rcu_nocb_wait_contended(struct rcu_data *rdp) cpu_relax(); } +/* + * Conditionally acquire the specified rcu_data structure's + * ->nocb_bypass_lock. 
+ */ +static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + return raw_spin_trylock(&rdp->nocb_bypass_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_bypass_lock. + */ +static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_bypass_lock); +} + +/* + * Acquire the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + return; + raw_spin_lock(&rdp->nocb_lock); +} + /* * Release the specified rcu_data structure's ->nocb_lock, but only * if it corresponds to a no-CBs CPU. @@ -1557,6 +1595,15 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, } } +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (rcu_segcblist_is_offloaded(&rdp->cblist) && + cpu_online(rdp->cpu)) + lockdep_assert_held(&rdp->nocb_lock); +} + /* * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. @@ -1593,24 +1640,27 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, unsigned long flags) __releases(rdp->nocb_lock) { + bool needwake = false; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; lockdep_assert_held(&rdp->nocb_lock); if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("AlreadyAwake")); rcu_nocb_unlock_irqrestore(rdp, flags); return; } - if (READ_ONCE(rdp_gp->nocb_gp_sleep) || force) { - del_timer(&rdp->nocb_timer); - rcu_nocb_unlock_irqrestore(rdp, flags); - smp_mb(); /* enqueue before ->nocb_gp_sleep. */ - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + del_timer(&rdp->nocb_timer); + rcu_nocb_unlock_irqrestore(rdp, flags); + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - wake_up_process(rdp_gp->nocb_gp_kthread); - } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + needwake = true; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + if (needwake) + wake_up_process(rdp_gp->nocb_gp_kthread); } /* @@ -1627,6 +1677,189 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); } +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + struct rcu_cblist rcl; + + WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist)); + rcu_lockdep_assert_cblist_protected(rdp); + lockdep_assert_held(&rdp->nocb_bypass_lock); + if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { + raw_spin_unlock(&rdp->nocb_bypass_lock); + return false; + } + /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ + if (rhp) + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. 
*/ + rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); + WRITE_ONCE(rdp->nocb_bypass_first, j); + rcu_nocb_bypass_unlock(rdp); + return true; +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + return true; + rcu_lockdep_assert_cblist_protected(rdp); + rcu_nocb_bypass_lock(rdp); + return rcu_nocb_do_flush_bypass(rdp, rhp, j); +} + +/* + * If the ->nocb_bypass_lock is immediately available, flush the + * ->nocb_bypass queue into ->cblist. + */ +static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) +{ + rcu_lockdep_assert_cblist_protected(rdp); + if (!rcu_segcblist_is_offloaded(&rdp->cblist) || + !rcu_nocb_bypass_trylock(rdp)) + return; + WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); +} + +/* + * See whether it is appropriate to use the ->nocb_bypass list in order + * to control contention on ->nocb_lock. A limited number of direct + * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass + * is non-empty, further callbacks must be placed into ->nocb_bypass, + * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch + * back to direct use of ->cblist. However, ->nocb_bypass should not be + * used if ->cblist is empty, because otherwise callbacks can be stranded + * on ->nocb_bypass because we cannot count on the current CPU ever again + * invoking call_rcu(). The general rule is that if ->nocb_bypass is + * non-empty, the corresponding no-CBs grace-period kthread must not be + * in an indefinite sleep state. + * + * Finally, it is not permitted to use the bypass during early boot, + * as doing so would confuse the auto-initialization code. Besides + * which, there is no point in worrying about lock contention while + * there is only one CPU in operation. + */ +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags) +{ + unsigned long c; + unsigned long cur_gp_seq; + unsigned long j = jiffies; + long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; /* Not offloaded, no bypassing. */ + } + lockdep_assert_irqs_disabled(); + + // Don't use ->nocb_bypass during early boot. + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + rcu_nocb_lock(rdp); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // If we have advanced to a new jiffy, reset counts to allow + // moving back from ->nocb_bypass to ->cblist. 
+ if (j == rdp->nocb_nobypass_last) { + c = rdp->nocb_nobypass_count + 1; + } else { + WRITE_ONCE(rdp->nocb_nobypass_last, j); + c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; + if (ULONG_CMP_LT(rdp->nocb_nobypass_count, + nocb_nobypass_lim_per_jiffy)) + c = 0; + else if (c > nocb_nobypass_lim_per_jiffy) + c = nocb_nobypass_lim_per_jiffy; + } + WRITE_ONCE(rdp->nocb_nobypass_count, c); + + // If there hasn't yet been all that many ->cblist enqueues + // this jiffy, tell the caller to enqueue onto ->cblist. But flush + // ->nocb_bypass first. + if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { + rcu_nocb_lock(rdp); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + + // If ->nocb_bypass has been used too long or is too full, + // flush ->nocb_bypass to ->cblist. + if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || + ncbs >= qhimark) { + rcu_nocb_lock(rdp); + if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + return true; // Callback already enqueued. + } + + // We need to use the bypass. + rcu_nocb_wait_contended(rdp); + rcu_nocb_bypass_lock(rdp); + ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); + if (!ncbs) { + WRITE_ONCE(rdp->nocb_bypass_first, j); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); + } + rcu_nocb_bypass_unlock(rdp); + smp_mb(); /* Order enqueue before wake. */ + if (ncbs) { + local_irq_restore(flags); + } else { + // No-CBs GP kthread might be indefinitely asleep, if so, wake. + rcu_nocb_lock(rdp); // Rare during call_rcu() flood. + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstBQwake")); + __call_rcu_nocb_wake(rdp, true, flags); + } else { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstBQnoWake")); + rcu_nocb_unlock_irqrestore(rdp, flags); + } + } + return true; // Callback already enqueued. +} + /* * Awaken the no-CBs grace-period kthead if needed, either due to it * legitimately being asleep or due to overload conditions. @@ -1685,23 +1918,33 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); rcu_nocb_unlock_irqrestore(rdp, flags); } - if (!irqs_disabled_flags(flags)) { - lockdep_assert_irqs_enabled(); - rcu_nocb_wait_contended(rdp); - } return; } +/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. 
*/ +static void do_nocb_bypass_wakeup_timer(struct timer_list *t) +{ + unsigned long flags; + struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer); + + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); + rcu_nocb_lock_irqsave(rdp, flags); + __call_rcu_nocb_wake(rdp, true, flags); +} + /* * No-CBs GP kthreads come here to wait for additional callbacks to show up * or for grace periods to end. */ static void nocb_gp_wait(struct rcu_data *my_rdp) { + bool bypass = false; + long bypass_ncbs; int __maybe_unused cpu = my_rdp->cpu; unsigned long cur_gp_seq; unsigned long flags; bool gotcbs; + unsigned long j = jiffies; bool needwait_gp = false; // This prevents actual uninitialized use. bool needwake; bool needwake_gp; @@ -1715,21 +1958,50 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * and the global grace-period kthread are awakened if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { - if (rcu_segcblist_empty(&rdp->cblist)) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); + rcu_nocb_lock_irqsave(rdp, flags); + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + if (bypass_ncbs && + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || + bypass_ncbs > 2 * qhimark)) { + // Bypass full or old, so flush it. + (void)rcu_nocb_try_flush_bypass(rdp, j); + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); continue; /* No callbacks here, try next. */ + } + if (bypass_ncbs) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("Bypass")); + bypass = true; + } rnp = rdp->mynode; - rcu_nocb_lock_irqsave(rdp, flags); - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake_gp = rcu_advance_cbs(rnp, rdp); - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + if (bypass) { // Avoid race with first bypass CB. + WRITE_ONCE(my_rdp->nocb_defer_wakeup, + RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + } + // Advance callbacks if helpful and low contention. + needwake_gp = false; + if (!rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL) || + (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake_gp = rcu_advance_cbs(rnp, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ + } // Need to wait on some grace period? + WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL)); if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { if (!needwait_gp || ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) wait_gp_seq = cur_gp_seq; needwait_gp = true; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("NeedWaitGP")); } if (rcu_segcblist_ready_cbs(&rdp->cblist)) { needwake = rdp->nocb_cb_sleep; @@ -1747,6 +2019,13 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) rcu_gp_kthread_wake(); } + if (bypass && !rcu_nocb_poll) { + // At least one child with non-empty ->nocb_bypass, so set + // timer in order to avoid stranding its callbacks. + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + mod_timer(&my_rdp->nocb_bypass_timer, j + 2); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } if (rcu_nocb_poll) { /* Polling, so trace if first poll in the series. 
*/ if (gotcbs) @@ -1757,6 +2036,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); } else { rnp = my_rdp->mynode; trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); @@ -1768,6 +2048,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (!rcu_nocb_poll) { raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + if (bypass) + del_timer(&my_rdp->nocb_bypass_timer); WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); } @@ -1949,8 +2231,11 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) init_swait_queue_head(&rdp->nocb_cb_wq); init_swait_queue_head(&rdp->nocb_gp_wq); raw_spin_lock_init(&rdp->nocb_lock); + raw_spin_lock_init(&rdp->nocb_bypass_lock); raw_spin_lock_init(&rdp->nocb_gp_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); + timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0); + rcu_cblist_init(&rdp->nocb_bypass); } /* @@ -2094,6 +2379,12 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, local_irq_restore(flags); } +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); +} + static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } @@ -2107,6 +2398,18 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) { } +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + return true; +} + +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags) +{ + return false; +} + static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, unsigned long flags) { -- cgit v1.2.3 From 6aacd88d1721e12b013ae4ccf4f17609bd5091f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 13 Jul 2019 12:27:03 -0700 Subject: rcu/nocb: EXP Check use and usefulness of ->nocb_lock_contended Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 12b14d7a2cf2..97c730753a6d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1515,6 +1515,7 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) if (raw_spin_trylock(&rdp->nocb_bypass_lock)) return; atomic_inc(&rdp->nocb_lock_contended); + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); smp_mb__after_atomic(); /* atomic_inc() before lock. */ raw_spin_lock(&rdp->nocb_bypass_lock); smp_mb__before_atomic(); /* atomic_dec() after lock. */ @@ -1533,7 +1534,8 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) */ static void rcu_nocb_wait_contended(struct rcu_data *rdp) { - while (atomic_read(&rdp->nocb_lock_contended)) + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) cpu_relax(); } -- cgit v1.2.3 From f7a81b12d6af42a9d09be1e5f041169f04b0b67a Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 25 Jun 2019 13:32:51 -0700 Subject: rcu/nocb: Print no-CBs diagnostics when rcutorture writer unduly delayed This commit causes locking, sleeping, and callback state to be printed for no-CBs CPUs when the rcutorture writer is delayed sufficiently for rcutorture to complain. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 1 + kernel/rcu/tree.h | 7 ++++- kernel/rcu/tree_plugin.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree_stall.h | 5 +++ 4 files changed, 94 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b22947324423..3c9feca1eab1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2176,6 +2176,7 @@ rcu_torture_cleanup(void) return; } + show_rcu_gp_kthreads(); rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); torture_stop_kthread(rcu_torture_stall, stall_task); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e4df86db8137..c612f306fe89 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -212,7 +212,11 @@ struct rcu_data { /* The following fields are used by GP kthread, hence own cacheline. */ raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp; struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */ - bool nocb_gp_sleep; /* Is the nocb GP thread asleep? */ + u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */ + u8 nocb_gp_bypass; /* Found a bypass on last scan? */ + u8 nocb_gp_gp; /* GP to wait for on last scan? */ + unsigned long nocb_gp_seq; /* If so, ->gp_seq to wait for. */ + unsigned long nocb_gp_loops; /* # passes through wait code. */ struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ struct task_struct *nocb_cb_kthread; @@ -438,6 +442,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); +static void show_rcu_nocb_state(struct rcu_data *rdp); static void rcu_nocb_lock(struct rcu_data *rdp); static void rcu_nocb_unlock(struct rcu_data *rdp); static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 97c730753a6d..25a53742ca68 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2021,6 +2021,9 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) rcu_gp_kthread_wake(); } + my_rdp->nocb_gp_bypass = bypass; + my_rdp->nocb_gp_gp = needwait_gp; + my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; if (bypass && !rcu_nocb_poll) { // At least one child with non-empty ->nocb_bypass, so set // timer in order to avoid stranding its callbacks. @@ -2055,6 +2058,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); } + my_rdp->nocb_gp_seq = -1; WARN_ON(signal_pending(current)); } @@ -2071,6 +2075,7 @@ static int rcu_nocb_gp_kthread(void *arg) struct rcu_data *rdp = arg; for (;;) { + WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1); nocb_gp_wait(rdp); cond_resched_tasks_rcu_qs(); } @@ -2362,6 +2367,79 @@ void rcu_bind_current_to_nocb(void) } EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); +/* + * Dump out nocb grace-period kthread state for the specified rcu_data + * structure. 
+ */ +static void show_rcu_nocb_gp_state(struct rcu_data *rdp) +{ + struct rcu_node *rnp = rdp->mynode; + + pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n", + rdp->cpu, + "kK"[!!rdp->nocb_gp_kthread], + "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], + "dD"[!!rdp->nocb_defer_wakeup], + "tT"[timer_pending(&rdp->nocb_timer)], + "bB"[timer_pending(&rdp->nocb_bypass_timer)], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[swait_active(&rdp->nocb_gp_wq)], + ".W"[swait_active(&rnp->nocb_gp_wq[0])], + ".W"[swait_active(&rnp->nocb_gp_wq[1])], + ".B"[!!rdp->nocb_gp_bypass], + ".G"[!!rdp->nocb_gp_gp], + (long)rdp->nocb_gp_seq, + rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops)); +} + +/* Dump out nocb kthread state for the specified rcu_data structure. */ +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ + struct rcu_segcblist *rsclp = &rdp->cblist; + bool waslocked; + bool wastimer; + bool wassleep; + + if (rdp->nocb_gp_rdp == rdp) + show_rcu_nocb_gp_state(rdp); + + pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n", + rdp->cpu, rdp->nocb_gp_rdp->cpu, + "kK"[!!rdp->nocb_cb_kthread], + "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], + "cC"[!!atomic_read(&rdp->nocb_lock_contended)], + "lL"[raw_spin_is_locked(&rdp->nocb_lock)], + "sS"[!!rdp->nocb_cb_sleep], + ".W"[swait_active(&rdp->nocb_cb_wq)], + jiffies - rdp->nocb_bypass_first, + jiffies - rdp->nocb_nobypass_last, + rdp->nocb_nobypass_count, + ".D"[rcu_segcblist_ready_cbs(rsclp)], + ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)], + ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)], + ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)], + ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + rcu_segcblist_n_cbs(&rdp->cblist)); + + /* It is OK for GP kthreads to have GP state. */ + if (rdp->nocb_gp_rdp == rdp) + return; + + waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); + wastimer = timer_pending(&rdp->nocb_timer); + wassleep = swait_active(&rdp->nocb_gp_wq); + if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep && + !waslocked && !wastimer && !wassleep) + return; /* Nothing untowards. */ + + pr_info(" !!! %c%c%c%c %c\n", + "lL"[waslocked], + "dD"[!!rdp->nocb_defer_wakeup], + "tT"[wastimer], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[wassleep]); +} + #else /* #ifdef CONFIG_RCU_NOCB_CPU */ /* No ->nocb_lock to acquire. */ @@ -2439,6 +2517,10 @@ static void __init rcu_spawn_nocb_kthreads(void) { } +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ +} + #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0627a66699a6..841ab43f3e60 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -589,6 +589,11 @@ void show_rcu_gp_kthreads(void) cpu, (long)rdp->gp_seq_needed); } } + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rcu_segcblist_is_offloaded(&rdp->cblist)) + show_rcu_nocb_state(rdp); + } /* sched_show_task(rcu_state.gp_kthread); */ } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); -- cgit v1.2.3 From 273f034065002bf9480601d66404c991b243b91e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Jul 2019 06:54:42 -0700 Subject: rcu/nocb: Avoid synchronous wakeup in __call_rcu_nocb_wake() When callbacks are in full flow, the common case is waiting for a grace period, and this grace period will normally take a few jiffies to complete. It therefore isn't all that helpful for __call_rcu_nocb_wake() to do a synchronous wakeup in this case. 
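
(Stepping back briefly to the diagnostics patch above: the one-character state flags it prints come from indexing a two-character string with a 0/1 value, so "sS"[!!sleeping] yields 's' when the flag is clear and 'S' when it is set. A standalone demo of that idiom follows; the demo_* names and the particular flags are invented for this sketch.)

#include <stdio.h>
#include <stdbool.h>

static void demo_print_state(bool kthread_exists, bool lock_held, bool sleeping)
{
        printf("state %c%c%c\n",
               "kK"[!!kthread_exists],  /* 'K' if the kthread exists */
               "lL"[!!lock_held],       /* 'L' if the lock is held */
               "sS"[!!sleeping]);       /* 'S' if the thread is asleep */
}

int main(void)
{
        demo_print_state(true, false, true);    /* prints "state KlS" */
        return 0;
}

(The upper-case letters make the "interesting" states stand out when scanning a long dump of per-CPU lines.)
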
This commit therefore turns this into a timer-based deferred wakeup of the no-CBs grace-period kthread. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 25a53742ca68..4b59ef1cbc8b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1900,22 +1900,13 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, } else if (len > rdp->qlen_last_fqs_check + qhimark) { /* ... or if many callbacks queued. */ rdp->qlen_last_fqs_check = len; - if (!rdp->nocb_cb_sleep && - rcu_segcblist_ready_cbs(&rdp->cblist)) { - // Already going full tilt, so don't try to rewake. - rcu_nocb_unlock_irqrestore(rdp, flags); - } else { + if (rdp->nocb_cb_sleep || + !rcu_segcblist_ready_cbs(&rdp->cblist)) { rcu_advance_cbs_nowake(rdp->mynode, rdp); - if (!irqs_disabled_flags(flags)) { - wake_nocb_gp(rdp, false, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeOvf")); - } else { - wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, - TPS("WakeOvfIsDeferred")); - rcu_nocb_unlock_irqrestore(rdp, flags); - } + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, + TPS("WakeOvfIsDeferred")); } + rcu_nocb_unlock_irqrestore(rdp, flags); } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); rcu_nocb_unlock_irqrestore(rdp, flags); -- cgit v1.2.3 From 23651d9b9616060cf86af5e3b15defcf3bcd2642 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jul 2019 12:54:56 -0700 Subject: rcu/nocb: Advance CBs after merge in rcutree_migrate_callbacks() The rcutree_migrate_callbacks() invokes rcu_advance_cbs() on both the offlined CPU's ->cblist and that of the surviving CPU, then merges them. However, after the merge, and of the offlined CPU's callbacks that were not ready to be invoked will no longer be associated with a grace-period number. This commit therefore invokes rcu_advance_cbs() one more time on the merged ->cblist in order to assign a grace-period number to these callbacks. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 457623100d12..3e89b5b83ea0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3205,6 +3205,7 @@ void rcutree_migrate_callbacks(int cpu) needwake = rcu_advance_cbs(my_rnp, rdp) || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); -- cgit v1.2.3 From 1d5a81c18dc68fc38a52e8dab1992a043a358927 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Jul 2019 01:09:04 -0700 Subject: rcu/nocb: Reduce nocb_cb_wait() leaf rcu_node ->lock contention Currently, nocb_cb_wait() advances callbacks on each pass through its loop, though only if it succeeds in conditionally acquiring its leaf rcu_node structure's ->lock. Despite the conditional acquisition of ->lock, this does increase contention. This commit therefore avoids advancing callbacks unless there are callbacks in ->cblist whose grace period has completed. Note that nocb_cb_wait() doesn't worry about callbacks that have not yet been assigned a grace period. 
The idea is that the only reason for nocb_cb_wait() to advance callbacks is to allow it to continue invoking callbacks. Time will tell whether this is the correct choice. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4b59ef1cbc8b..f6f23a16bd64 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2079,6 +2079,7 @@ static int rcu_nocb_gp_kthread(void *arg) */ static void nocb_cb_wait(struct rcu_data *rdp) { + unsigned long cur_gp_seq; unsigned long flags; bool needwake_gp = false; struct rcu_node *rnp = rdp->mynode; @@ -2091,7 +2092,9 @@ static void nocb_cb_wait(struct rcu_data *rdp) local_bh_enable(); lockdep_assert_irqs_enabled(); rcu_nocb_lock_irqsave(rdp, flags); - if (raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ + if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && + raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } -- cgit v1.2.3 From 296181d78df9892e08e794f2a9a4d2c38f9acedb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Jul 2019 06:06:40 -0700 Subject: rcu/nocb: Reduce __call_rcu_nocb_wake() leaf rcu_node ->lock contention Currently, __call_rcu_nocb_wake() advances callbacks each time that it detects excessive numbers of callbacks, though only if it succeeds in conditionally acquiring its leaf rcu_node structure's ->lock. Despite the conditional acquisition of ->lock, this does increase contention. This commit therefore avoids advancing callbacks unless there are callbacks in ->cblist whose grace period has completed and advancing has not yet been done during this jiffy. Note that this decision does not take the presence of new callbacks into account. That is because on this code path, there will always be at least one new callback, namely the one we just enqueued. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f6f23a16bd64..f56fb4e97a8e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1872,6 +1872,8 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, unsigned long flags) __releases(rdp->nocb_lock) { + unsigned long cur_gp_seq; + unsigned long j; long len; struct task_struct *t; @@ -1900,12 +1902,17 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, } else if (len > rdp->qlen_last_fqs_check + qhimark) { /* ... or if many callbacks queued. */ rdp->qlen_last_fqs_check = len; - if (rdp->nocb_cb_sleep || - !rcu_segcblist_ready_cbs(&rdp->cblist)) { + j = jiffies; + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; + } + if (rdp->nocb_cb_sleep || + !rcu_segcblist_ready_cbs(&rdp->cblist)) wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); - } rcu_nocb_unlock_irqrestore(rdp, flags); } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); -- cgit v1.2.3 From f48fe4c586604c3a09938c6a6e9fd3356dfe8f3c Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 16 Jul 2019 02:17:00 -0700 Subject: rcu/nocb: Don't wake no-CBs GP kthread if timer posted under overload When under overload conditions, __call_rcu_nocb_wake() will wake the no-CBs GP kthread any time the no-CBs CB kthread is asleep or there are no ready-to-invoke callbacks, but only after a timer delay. If the no-CBs GP kthread has a ->nocb_bypass_timer pending, the deferred wakeup from __call_rcu_nocb_wake() is redundant. This commit therefore makes __call_rcu_nocb_wake() avoid posting the redundant deferred wakeup if ->nocb_bypass_timer is pending. This requires adding a bit of ordering of timer actions. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f56fb4e97a8e..2defc7fe74c3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1909,8 +1909,10 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rcu_advance_cbs_nowake(rdp->mynode, rdp); rdp->nocb_gp_adv_time = j; } - if (rdp->nocb_cb_sleep || - !rcu_segcblist_ready_cbs(&rdp->cblist)) + smp_mb(); /* Enqueue before timer_pending(). */ + if ((rdp->nocb_cb_sleep || + !rcu_segcblist_ready_cbs(&rdp->cblist)) && + !timer_pending(&rdp->nocb_bypass_timer)) wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); rcu_nocb_unlock_irqrestore(rdp, flags); @@ -1929,6 +1931,7 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t) trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); rcu_nocb_lock_irqsave(rdp, flags); + smp_mb__after_spinlock(); /* Timer expire before wakeup. */ __call_rcu_nocb_wake(rdp, true, flags); } -- cgit v1.2.3 From cfcdef5e30469f3f2d6786ad35fc3fdef2a3833f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Jul 2019 18:07:52 -0700 Subject: rcu: Allow rcu_do_batch() to dynamically adjust batch sizes Bimodal behavior of rcu_do_batch() is not really suited to Google applications like gfe servers. When a process with millions of sockets exits, closing all files queues two rcu callbacks per socket. This eventually reaches the point where RCU enters an emergency mode, where rcu_do_batch() do not return until whole queue is flushed. Each rcu callback lasts at least 70 nsec, so with millions of elements, we easily spend more than 100 msec without rescheduling. Goal of this patch is to avoid the infamous message like following "need_resched set for > 51999388 ns (52 ticks) without schedule" We dynamically adjust the number of elements we process, instead of 10 / INFINITE choices, we use a floor of ~1 % of current entries. If the number is above 1000, we switch to a time based limit of 3 msec per batch, adjustable with /sys/module/rcutree/parameters/rcu_resched_ns Signed-off-by: Eric Dumazet [ paulmck: Forward-port and remove debug statements. ] Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3e89b5b83ea0..71395e91b876 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "../time/tick-internal.h" #include "tree.h" @@ -416,6 +417,12 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; +static int rcu_divisor = 7; +module_param(rcu_divisor, int, 0644); + +/* Force an exit from rcu_do_batch() after 3 milliseconds. */ +static long rcu_resched_ns = 3 * NSEC_PER_MSEC; +module_param(rcu_resched_ns, long, 0644); /* * How long the grace period must be before we start recruiting @@ -2109,6 +2116,7 @@ static void rcu_do_batch(struct rcu_data *rdp) struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count; + long pending, tlimit = 0; /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { @@ -2130,7 +2138,10 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); - bl = rdp->blimit; + pending = rcu_segcblist_n_cbs(&rdp->cblist); + bl = max(rdp->blimit, pending >> rcu_divisor); + if (unlikely(bl > 100)) + tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); @@ -2153,6 +2164,13 @@ static void rcu_do_batch(struct rcu_data *rdp) (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; + if (unlikely(tlimit)) { + /* only call local_clock() every 32 callbacks */ + if (likely((-rcl.len & 31) || local_clock() < tlimit)) + continue; + /* Exceeded the time limit, so leave. */ + break; + } if (offloaded) { WARN_ON_ONCE(in_serving_softirq()); local_bh_enable(); -- cgit v1.2.3 From e03250061b54041d3502696db421c44a4a8039f4 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 16 Aug 2019 02:40:44 +0000 Subject: btf: fix return value check in btf_vmlinux_init() In case of error, the function kobject_create_and_add() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Fixes: 341dfcf8d78e ("btf: expose BTF info through sysfs") Signed-off-by: Wei Yongjun Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/sysfs_btf.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 4659349fc795..7ae5dddd1fe6 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -30,17 +30,12 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - int err; - if (!_binary__btf_vmlinux_bin_start) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); - if (IS_ERR(btf_kobj)) { - err = PTR_ERR(btf_kobj); - btf_kobj = NULL; - return err; - } + if (!btf_kobj) + return -ENOMEM; bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end - _binary__btf_vmlinux_bin_start; -- cgit v1.2.3 From 35c35493b0e3343522256f6054516f4e2161a6d4 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Fri, 9 Aug 2019 15:10:34 +0800 Subject: printk: Replace strncmp() with str_has_prefix() strncmp(str, const, len) is error-prone because len is easy to have typo. 
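A minimal userspace sketch of the helper's semantics may be useful here; it assumes, as the hunks below do, that str_has_prefix() returns the length of the matched prefix and 0 on a mismatch:

  #include <stdio.h>
  #include <string.h>

  /* Sketch of a str_has_prefix()-style helper: the prefix length is
   * computed once, inside the helper, so callers cannot mistype it. */
  static size_t str_has_prefix_demo(const char *str, const char *prefix)
  {
          size_t len = strlen(prefix);

          return strncmp(str, prefix, len) == 0 ? len : 0;
  }

  int main(void)
  {
          const char *arg = "brl=ttyS0,115200";
          size_t len = str_has_prefix_demo(arg, "brl=");

          if (len)
                  printf("options: %s\n", arg + len);  /* prints "options: ttyS0,115200" */
          return 0;
  }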
An example is the hard-coded len has counting error or sizeof(const) forgets - 1. So we prefer using newly introduced str_has_prefix() to substitute such strncmp() to make code better. Link: http://lkml.kernel.org/r/20190809071034.17279-1-hslester96@gmail.com Cc: "Steven Rostedt" Cc: linux-kernel@vger.kernel.org Signed-off-by: Chuhong Yuan Reviewed-by: Sergey Senozhatsky [pmladek@suse.com: Slightly updated and reformatted the commit message.] Signed-off-by: Petr Mladek --- kernel/printk/braille.c | 15 +++++++++++---- kernel/printk/printk.c | 22 ++++++++++++++++------ 2 files changed, 27 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 1d21ebacfdb8..17a9591e54ff 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -11,11 +11,18 @@ int _braille_console_setup(char **str, char **brl_options) { - if (!strncmp(*str, "brl,", 4)) { + size_t len; + + len = str_has_prefix(*str, "brl,"); + if (len) { *brl_options = ""; - *str += 4; - } else if (!strncmp(*str, "brl=", 4)) { - *brl_options = *str + 4; + *str += len; + return 0; + } + + len = str_has_prefix(*str, "brl="); + if (len) { + *brl_options = *str + len; *str = strchr(*brl_options, ','); if (!*str) { pr_err("need port name after brl=\n"); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1888f6a3b694..43a31015ec93 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -118,19 +118,29 @@ static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; static int __control_devkmsg(char *str) { + size_t len; + if (!str) return -EINVAL; - if (!strncmp(str, "on", 2)) { + len = str_has_prefix(str, "on"); + if (len) { devkmsg_log = DEVKMSG_LOG_MASK_ON; - return 2; - } else if (!strncmp(str, "off", 3)) { + return len; + } + + len = str_has_prefix(str, "off"); + if (len) { devkmsg_log = DEVKMSG_LOG_MASK_OFF; - return 3; - } else if (!strncmp(str, "ratelimit", 9)) { + return len; + } + + len = str_has_prefix(str, "ratelimit"); + if (len) { devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; - return 9; + return len; } + return -EINVAL; } -- cgit v1.2.3 From d30bdfc0ecf885ca0495bdee522cb9d7ee005cf8 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Fri, 9 Aug 2019 15:10:23 +0800 Subject: PM: sleep: Replace strncmp() with str_has_prefix() Use str_has_prefix() instead of strncmp() which is less straightforward in decode_state(). Signed-off-by: Chuhong Yuan [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 938dc53a8b94..e8710d179b35 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -580,7 +580,7 @@ static suspend_state_t decode_state(const char *buf, size_t n) len = p ? p - buf : n; /* Check hibernation first. */ - if (len == 4 && !strncmp(buf, "disk", len)) + if (len == 4 && str_has_prefix(buf, "disk")) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND -- cgit v1.2.3 From 0d3d343560bad8e1b7879fe94251cfe731a2dd13 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Aug 2019 09:25:14 +0200 Subject: genirq: remove the is_affinity_mask_valid hook This override was only used by the ia64 SGI SN2 platform, which is gone now. 
Signed-off-by: Christoph Hellwig Link: https://lkml.kernel.org/r/20190813072514.23299-29-hch@lst.de Signed-off-by: Tony Luck --- kernel/irq/proc.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index da9addb8d655..cfc4f088a0e7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -100,10 +100,6 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) return 0; } -#ifndef is_affinity_mask_valid -#define is_affinity_mask_valid(val) 1 -#endif - int no_irq_affinity; static int irq_affinity_proc_show(struct seq_file *m, void *v) { @@ -136,11 +132,6 @@ static ssize_t write_irq_affinity(int type, struct file *file, if (err) goto free_cpumask; - if (!is_affinity_mask_valid(new_value)) { - err = -EINVAL; - goto free_cpumask; - } - /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least @@ -232,11 +223,6 @@ static ssize_t default_affinity_write(struct file *file, if (err) goto out; - if (!is_affinity_mask_valid(new_value)) { - err = -EINVAL; - goto out; - } - /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least -- cgit v1.2.3 From b0e4701ce15d0381cdea0643c7f0a35dc529cec2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 14 Aug 2019 10:37:48 -0700 Subject: bpf: export bpf_map_inc_not_zero Rename existing bpf_map_inc_not_zero to __bpf_map_inc_not_zero to indicate that it's caller's responsibility to do proper locking. Create and export bpf_map_inc_not_zero wrapper that properly locks map_idr_lock. Will be used in the next commit to hold a map while cloning a socket. Cc: Martin KaFai Lau Cc: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5d141f16f6fa..cf8052b016e7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -683,8 +683,8 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) } /* map_idr_lock should have been held */ -static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, - bool uref) +static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, + bool uref) { int refold; @@ -704,6 +704,16 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, return map; } +struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +{ + spin_lock_bh(&map_idr_lock); + map = __bpf_map_inc_not_zero(map, uref); + spin_unlock_bh(&map_idr_lock); + + return map; +} +EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); + int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -2177,7 +2187,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) - map = bpf_map_inc_not_zero(map, true); + map = __bpf_map_inc_not_zero(map, true); else map = ERR_PTR(-ENOENT); spin_unlock_bh(&map_idr_lock); -- cgit v1.2.3 From 0402acd683c678874df6bdbc23530ca07ea19353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 15 Aug 2019 11:30:13 +0200 Subject: xsk: remove AF_XDP socket from map when the socket is released MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an AF_XDP socket is released/closed the XSKMAP still holds a 
reference to the socket in a "released" state. The socket will still use the netdev queue resource, and block newly created sockets from attaching to that queue, but no user application can access the fill/complete/rx/tx queues. This results in that all applications need to explicitly clear the map entry from the old "zombie state" socket. This should be done automatically. In this patch, the sockets tracks, and have a reference to, which maps it resides in. When the socket is released, it will remove itself from all maps. Suggested-by: Bruce Richardson Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 125 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 105 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9bb96ace9fa1..16031d489173 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -13,8 +13,71 @@ struct xsk_map { struct bpf_map map; struct xdp_sock **xsk_map; struct list_head __percpu *flush_list; + spinlock_t lock; /* Synchronize map updates */ }; +int xsk_map_inc(struct xsk_map *map) +{ + struct bpf_map *m = &map->map; + + m = bpf_map_inc(m, false); + return IS_ERR(m) ? PTR_ERR(m) : 0; +} + +void xsk_map_put(struct xsk_map *map) +{ + bpf_map_put(&map->map); +} + +static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *node; + int err; + + node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return NULL; + + err = xsk_map_inc(map); + if (err) { + kfree(node); + return ERR_PTR(err); + } + + node->map = map; + node->map_entry = map_entry; + return node; +} + +static void xsk_map_node_free(struct xsk_map_node *node) +{ + xsk_map_put(node->map); + kfree(node); +} + +static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) +{ + spin_lock_bh(&xs->map_list_lock); + list_add_tail(&node->node, &xs->map_list); + spin_unlock_bh(&xs->map_list_lock); +} + +static void xsk_map_sock_delete(struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *n, *tmp; + + spin_lock_bh(&xs->map_list_lock); + list_for_each_entry_safe(n, tmp, &xs->map_list, node) { + if (map_entry == n->map_entry) { + list_del(&n->node); + xsk_map_node_free(n); + } + } + spin_unlock_bh(&xs->map_list_lock); +} + static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct xsk_map *m; @@ -34,6 +97,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&m->map, attr); + spin_lock_init(&m->lock); cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); cost += sizeof(struct list_head) * num_possible_cpus(); @@ -71,21 +135,9 @@ free_m: static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); - int i; bpf_clear_redirect_map(map); synchronize_net(); - - for (i = 0; i < map->max_entries; i++) { - struct xdp_sock *xs; - - xs = m->xsk_map[i]; - if (!xs) - continue; - - sock_put((struct sock *)xs); - } - free_percpu(m->flush_list); bpf_map_area_free(m->xsk_map); kfree(m); @@ -164,8 +216,9 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs, *old_xs, **map_entry; u32 i = *(u32 *)key, fd = *(u32 *)value; - struct xdp_sock *xs, *old_xs; + struct xsk_map_node *node; struct socket *sock; int err; @@ -192,32 +245,64 @@ static int xsk_map_update_elem(struct 
bpf_map *map, void *key, void *value, return -EOPNOTSUPP; } - sock_hold(sock->sk); + map_entry = &m->xsk_map[i]; + node = xsk_map_node_alloc(m, map_entry); + if (IS_ERR(node)) { + sockfd_put(sock); + return PTR_ERR(node); + } - old_xs = xchg(&m->xsk_map[i], xs); + spin_lock_bh(&m->lock); + old_xs = READ_ONCE(*map_entry); + if (old_xs == xs) { + err = 0; + goto out; + } + xsk_map_sock_add(xs, node); + WRITE_ONCE(*map_entry, xs); if (old_xs) - sock_put((struct sock *)old_xs); - + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); sockfd_put(sock); return 0; + +out: + spin_unlock_bh(&m->lock); + sockfd_put(sock); + xsk_map_node_free(node); + return err; } static int xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs; + struct xdp_sock *old_xs, **map_entry; int k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; - old_xs = xchg(&m->xsk_map[k], NULL); + spin_lock_bh(&m->lock); + map_entry = &m->xsk_map[k]; + old_xs = xchg(map_entry, NULL); if (old_xs) - sock_put((struct sock *)old_xs); + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); return 0; } +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + spin_lock_bh(&map->lock); + if (READ_ONCE(*map_entry) == xs) { + WRITE_ONCE(*map_entry, NULL); + xsk_map_sock_delete(xs, map_entry); + } + spin_unlock_bh(&map->lock); +} + const struct bpf_map_ops xsk_map_ops = { .map_alloc = xsk_map_alloc, .map_free = xsk_map_free, -- cgit v1.2.3 From 36cc34358caf631115e1566485fffed3cf4196b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 15 Aug 2019 11:30:14 +0200 Subject: xsk: support BPF_EXIST and BPF_NOEXIST flags in XSKMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XSKMAP did not honor the BPF_EXIST/BPF_NOEXIST flags when updating an entry. This patch addresses that. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 16031d489173..4cc28e226398 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -226,8 +226,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, return -EINVAL; if (unlikely(i >= m->map.max_entries)) return -E2BIG; - if (unlikely(map_flags == BPF_NOEXIST)) - return -EEXIST; sock = sockfd_lookup(fd, &err); if (!sock) @@ -257,6 +255,12 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, if (old_xs == xs) { err = 0; goto out; + } else if (old_xs && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto out; + } else if (!old_xs && map_flags == BPF_EXIST) { + err = -ENOENT; + goto out; } xsk_map_sock_add(xs, node); WRITE_ONCE(*map_entry, xs); -- cgit v1.2.3 From b0fdc01354f45d43f082025636ef808968a27b36 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 16 Aug 2019 18:06:26 +0200 Subject: sched/core: Schedule new worker even if PI-blocked If a task is PI-blocked (blocking on sleeping spinlock) then we don't want to schedule a new kworker if we schedule out due to lock contention because !RT does not do that as well. A spinning spinlock disables preemption and a worker does not schedule out on lock contention (but spin). On RT the RW-semaphore implementation uses an rtmutex so tsk_is_pi_blocked() will return true if a task blocks on it. 
In this case we will now start a new worker which may deadlock if one worker is waiting on progress from another worker. Since a RW-semaphore starts a new worker on !RT, we should do the same on RT. XFS is able to trigger this deadlock. Allow to schedule new worker if the current worker is PI-blocked. Signed-off-by: Sebastian Andrzej Siewior Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20190816160626.12742-1-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..010d578118d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3904,7 +3904,7 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state || tsk_is_pi_blocked(tsk)) + if (!tsk->state) return; /* @@ -3920,6 +3920,9 @@ static inline void sched_submit_work(struct task_struct *tsk) preempt_enable_no_resched(); } + if (tsk_is_pi_blocked(tsk)) + return; + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. -- cgit v1.2.3 From f1c6ece23729257fb46562ff9224cf5f61b818da Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 12 Aug 2019 20:43:02 +0200 Subject: kprobes: Fix potential deadlock in kprobe_optimizer() lockdep reports the following deadlock scenario: WARNING: possible circular locking dependency detected kworker/1:1/48 is trying to acquire lock: 000000008d7a62b2 (text_mutex){+.+.}, at: kprobe_optimizer+0x163/0x290 but task is already holding lock: 00000000850b5e2d (module_mutex){+.+.}, at: kprobe_optimizer+0x31/0x290 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (module_mutex){+.+.}: __mutex_lock+0xac/0x9f0 mutex_lock_nested+0x1b/0x20 set_all_modules_text_rw+0x22/0x90 ftrace_arch_code_modify_prepare+0x1c/0x20 ftrace_run_update_code+0xe/0x30 ftrace_startup_enable+0x2e/0x50 ftrace_startup+0xa7/0x100 register_ftrace_function+0x27/0x70 arm_kprobe+0xb3/0x130 enable_kprobe+0x83/0xa0 enable_trace_kprobe.part.0+0x2e/0x80 kprobe_register+0x6f/0xc0 perf_trace_event_init+0x16b/0x270 perf_kprobe_init+0xa7/0xe0 perf_kprobe_event_init+0x3e/0x70 perf_try_init_event+0x4a/0x140 perf_event_alloc+0x93a/0xde0 __do_sys_perf_event_open+0x19f/0xf30 __x64_sys_perf_event_open+0x20/0x30 do_syscall_64+0x65/0x1d0 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (text_mutex){+.+.}: __lock_acquire+0xfcb/0x1b60 lock_acquire+0xca/0x1d0 __mutex_lock+0xac/0x9f0 mutex_lock_nested+0x1b/0x20 kprobe_optimizer+0x163/0x290 process_one_work+0x22b/0x560 worker_thread+0x50/0x3c0 kthread+0x112/0x150 ret_from_fork+0x3a/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(module_mutex); lock(text_mutex); lock(module_mutex); lock(text_mutex); *** DEADLOCK *** As a reproducer I've been using bcc's funccount.py (https://github.com/iovisor/bcc/blob/master/tools/funccount.py), for example: # ./funccount.py '*interrupt*' That immediately triggers the lockdep splat. Fix by acquiring text_mutex before module_mutex in kprobe_optimizer(). Signed-off-by: Andrea Righi Acked-by: Masami Hiramatsu Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Linus Torvalds Cc: Naveen N. 
Rao Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: d5b844a2cf50 ("ftrace/x86: Remove possible deadlock between register_kprobe() and ftrace_run_update_code()") Link: http://lkml.kernel.org/r/20190812184302.GA7010@xps-13 Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9873fc627d61..d9770a5393c8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -470,6 +470,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); */ static void do_optimize_kprobes(void) { + lockdep_assert_held(&text_mutex); /* * The optimization/unoptimization refers online_cpus via * stop_machine() and cpu-hotplug modifies online_cpus. @@ -487,9 +488,7 @@ static void do_optimize_kprobes(void) list_empty(&optimizing_list)) return; - mutex_lock(&text_mutex); arch_optimize_kprobes(&optimizing_list); - mutex_unlock(&text_mutex); } /* @@ -500,6 +499,7 @@ static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; + lockdep_assert_held(&text_mutex); /* See comment in do_optimize_kprobes() */ lockdep_assert_cpus_held(); @@ -507,7 +507,6 @@ static void do_unoptimize_kprobes(void) if (list_empty(&unoptimizing_list)) return; - mutex_lock(&text_mutex); arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ list_for_each_entry_safe(op, tmp, &freeing_list, list) { @@ -524,7 +523,6 @@ static void do_unoptimize_kprobes(void) } else list_del_init(&op->list); } - mutex_unlock(&text_mutex); } /* Reclaim all kprobes on the free_list */ @@ -556,6 +554,7 @@ static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); cpus_read_lock(); + mutex_lock(&text_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -583,6 +582,7 @@ static void kprobe_optimizer(struct work_struct *work) do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); + mutex_unlock(&text_mutex); cpus_read_unlock(); mutex_unlock(&kprobe_mutex); -- cgit v1.2.3 From 4ff96fb52c6964ad42e0a878be8f86a2e8052ddd Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Fri, 19 Jul 2019 14:28:39 +0200 Subject: livepatch: Nullify obj->mod in klp_module_coming()'s error path klp_module_coming() is called for every module appearing in the system. It sets obj->mod to a patched module for klp_object obj. Unfortunately it leaves it set even if an error happens later in the function and the patched module is not allowed to be loaded. klp_is_object_loaded() uses obj->mod variable and could currently give a wrong return value. The bug is probably harmless as of now. Signed-off-by: Miroslav Benes Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c4ce08f43bd6..ab4a4606d19b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1175,6 +1175,7 @@ err: pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", patch->mod->name, obj->mod->name, obj->mod->name); mod->klp_alive = false; + obj->mod = NULL; klp_cleanup_module_patches_limited(mod, patch); mutex_unlock(&klp_mutex); -- cgit v1.2.3 From 33da8e7c814f77310250bb54a9db36a44c5de784 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Fri, 16 Aug 2019 12:33:54 -0500 Subject: signal: Allow cifs and drbd to receive their terminating signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My recent to change to only use force_sig for a synchronous events wound up breaking signal reception cifs and drbd. I had overlooked the fact that by default kthreads start out with all signals set to SIG_IGN. So a change I thought was safe turned out to have made it impossible for those kernel thread to catch their signals. Reverting the work on force_sig is a bad idea because what the code was doing was very much a misuse of force_sig. As the way force_sig ultimately allowed the signal to happen was to change the signal handler to SIG_DFL. Which after the first signal will allow userspace to send signals to these kernel threads. At least for wake_ack_receiver in drbd that does not appear actively wrong. So correct this problem by adding allow_kernel_signal that will allow signals whose siginfo reports they were sent by the kernel through, but will not allow userspace generated signals, and update cifs and drbd to call allow_kernel_signal in an appropriate place so that their thread can receive this signal. Fixing things this way ensures that userspace won't be able to send signals and cause problems, that it is clear which signals the threads are expecting to receive, and it guarantees that nothing else in the system will be affected. This change was partly inspired by similar cifs and drbd patches that added allow_signal. Reported-by: ronnie sahlberg Reported-by: Christoph Böhmwalder Tested-by: Christoph Böhmwalder Cc: Steve French Cc: Philipp Reisner Cc: David Laight Fixes: 247bc9470b1e ("cifs: fix rmmod regression in cifs.ko caused by force_sig changes") Fixes: 72abe3bcf091 ("signal/cifs: Fix cifs_put_tcp_session to call send_sig instead of force_sig") Fixes: fee109901f39 ("signal/drbd: Use send_sig not force_sig") Fixes: 3cf5d076fb4d ("signal: Remove task parameter from force_sig") Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e667be6907d7..534fec266a33 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -90,6 +90,11 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force) handler == SIG_DFL && !(force && sig_kernel_only(sig))) return true; + /* Only allow kernel generated signals to this kthread */ + if (unlikely((t->flags & PF_KTHREAD) && + (handler == SIG_KTHREAD_KERNEL) && !force)) + return true; + return sig_handler_ignored(handler, sig); } -- cgit v1.2.3 From b6a32bbd8735def2d0d696ba59205d1874b7800f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 16 Aug 2019 18:09:23 +0200 Subject: genirq: Force interrupt threading on RT Switch force_irqthreads from a boot time modifiable variable to a compile time constant when CONFIG_PREEMPT_RT is enabled. 
Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190816160923.12855-1-bigeasy@linutronix.de --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e8f7f179bf77..97de1b7d43af 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -23,7 +23,7 @@ #include "internals.h" -#ifdef CONFIG_IRQ_FORCED_THREADING +#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) __read_mostly bool force_irqthreads; EXPORT_SYMBOL_GPL(force_irqthreads); -- cgit v1.2.3 From d0ff14fdc987303aeeb7de6f1bd72c3749ae2a9b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Thu, 1 Aug 2019 23:53:53 +0000 Subject: genirq: Properly pair kobject_del() with kobject_add() If alloc_descs() fails before irq_sysfs_init() has run, free_desc() in the cleanup path will call kobject_del() even though the kobject has not been added with kobject_add(). Fix this by making the call to kobject_del() conditional on whether irq_sysfs_init() has run. This problem surfaced because commit aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") makes kobject_del() stricter about pairing with kobject_add(). If the pairing is incorrrect, a WARNING and backtrace occur in sysfs_remove_group() because there is no parent. [ tglx: Add a comment to the code and make it work with CONFIG_SYSFS=n ] Fixes: ecb3f394c5db ("genirq: Expose interrupt information through sysfs") Signed-off-by: Michael Kelley Signed-off-by: Thomas Gleixner Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1564703564-4116-1-git-send-email-mikelley@microsoft.com --- kernel/irq/irqdesc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9484e88dabc2..9be995fc3c5a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -295,6 +295,18 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc) } } +static void irq_sysfs_del(struct irq_desc *desc) +{ + /* + * If irq_sysfs_init() has not yet been invoked (early boot), then + * irq_kobj_base is NULL and the descriptor was never added. + * kobject_del() complains about a object with no parent, so make + * it conditional. + */ + if (irq_kobj_base) + kobject_del(&desc->kobj); +} + static int __init irq_sysfs_init(void) { struct irq_desc *desc; @@ -325,6 +337,7 @@ static struct kobj_type irq_kobj_type = { }; static void irq_sysfs_add(int irq, struct irq_desc *desc) {} +static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ @@ -438,7 +451,7 @@ static void free_desc(unsigned int irq) * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ - kobject_del(&desc->kobj); + irq_sysfs_del(desc); delete_irq_desc(irq); /* -- cgit v1.2.3 From 49fcf732bdae0550721ef73af7c45109ce26b2a9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:40 -0700 Subject: lockdown: Enforce module signatures if the kernel is locked down If the kernel is locked down, require that all modules have valid signatures that we can verify. I have adjusted the errors generated: (1) If there's no signature (ENODATA) or we can't check it (ENOPKG, ENOKEY), then: (a) If signatures are enforced then EKEYREJECTED is returned. 
(b) If there's no signature or we can't check it, but the kernel is locked down then EPERM is returned (this is then consistent with other lockdown cases). (2) If the signature is unparseable (EBADMSG, EINVAL), the signature fails the check (EKEYREJECTED) or a system error occurs (eg. ENOMEM), we return the error we got. Note that the X.509 code doesn't check for key expiry as the RTC might not be valid or might not have been transferred to the kernel's clock yet. [Modified by Matthew Garrett to remove the IMA integration. This will be replaced with integration with the IMA architecture policy patchset.] Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: Jessica Yu Signed-off-by: James Morris --- kernel/module.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 80c7c09584cf..2206c08a5e10 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2753,8 +2753,9 @@ static inline void kmemleak_load_module(const struct module *mod, #ifdef CONFIG_MODULE_SIG static int module_sig_check(struct load_info *info, int flags) { - int err = -ENOKEY; + int err = -ENODATA; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const char *reason; const void *mod = info->hdr; /* @@ -2769,16 +2770,38 @@ static int module_sig_check(struct load_info *info, int flags) err = mod_verify_sig(mod, info); } - if (!err) { + switch (err) { + case 0: info->sig_ok = true; return 0; - } - /* Not having a signature is only an error if we're strict. */ - if (err == -ENOKEY && !is_module_sig_enforced()) - err = 0; + /* We don't permit modules to be loaded into trusted kernels + * without a valid signature on them, but if we're not + * enforcing, certain errors are non-fatal. + */ + case -ENODATA: + reason = "Loading of unsigned module"; + goto decide; + case -ENOPKG: + reason = "Loading of module with unsupported crypto"; + goto decide; + case -ENOKEY: + reason = "Loading of module with unavailable key"; + decide: + if (is_module_sig_enforced()) { + pr_notice("%s is rejected\n", reason); + return -EKEYREJECTED; + } - return err; + return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); + + /* All other errors are fatal, including nomem, unparseable + * signatures and signature check failures - even if signatures + * aren't required. + */ + default: + return err; + } } #else /* !CONFIG_MODULE_SIG */ static int module_sig_check(struct load_info *info, int flags) -- cgit v1.2.3 From 7d31f4602f8d366072471ca138e4ea7b8edf9be0 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:17:42 -0700 Subject: kexec_load: Disable at runtime if the kernel is locked down The kexec_load() syscall permits the loading and execution of arbitrary code in ring 0, which is something that lock-down is meant to prevent. It makes sense to disable kexec_load() in this situation. This does not affect kexec_file_load() syscall which can check for a signature on the image to be booted. 
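From userspace the effect is simply that the syscall is refused before any segments are examined. A small probe of that behaviour (hypothetical test code, not part of the patch; it needs CAP_SYS_BOOT to get past the capability check at all):

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  int main(void)
  {
          /* Zero segments: enough to reach the permission checks. */
          long ret = syscall(SYS_kexec_load, 0UL, 0UL, NULL, 0UL);

          if (ret < 0)
                  printf("kexec_load: %s\n", strerror(errno));
          /* Expected on a locked-down kernel: "Operation not permitted"
           * (EPERM); without CAP_SYS_BOOT the same errno shows up for a
           * different reason. */
          return 0;
  }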
Signed-off-by: David Howells Signed-off-by: Matthew Garrett Acked-by: Dave Young Reviewed-by: Kees Cook cc: kexec@lists.infradead.org Signed-off-by: James Morris --- kernel/kexec.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 1b018f1a6e0d..bc933c0db9bf 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -205,6 +205,14 @@ static inline int kexec_load_check(unsigned long nr_segments, if (result < 0) return result; + /* + * kexec can be used to circumvent module loading restrictions, so + * prevent loading in that case + */ + result = security_locked_down(LOCKDOWN_KEXEC); + if (result) + return result; + /* * Verify we have a legal set of flags * This leaves us room for future extensions. -- cgit v1.2.3 From 99d5cadfde2b1acb7650021df5abaa5ec447dd10 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Mon, 19 Aug 2019 17:17:44 -0700 Subject: kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE This is a preparatory patch for kexec_file_load() lockdown. A locked down kernel needs to prevent unsigned kernel images from being loaded with kexec_file_load(). Currently, the only way to force the signature verification is compiling with KEXEC_VERIFY_SIG. This prevents loading usigned images even when the kernel is not locked down at runtime. This patch splits KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE. Analogous to the MODULE_SIG and MODULE_SIG_FORCE for modules, KEXEC_SIG turns on the signature verification but allows unsigned images to be loaded. KEXEC_SIG_FORCE disallows images without a valid signature. Signed-off-by: Jiri Bohac Signed-off-by: David Howells Signed-off-by: Matthew Garrett cc: kexec@lists.infradead.org Signed-off-by: James Morris --- kernel/kexec_file.c | 60 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ef7b951a8087..972931201995 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -88,7 +88,7 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG static int kexec_image_verify_sig_default(struct kimage *image, void *buf, unsigned long buf_len) { @@ -177,6 +177,51 @@ void kimage_file_post_load_cleanup(struct kimage *image) image->image_loader_data = NULL; } +#ifdef CONFIG_KEXEC_SIG +static int +kimage_validate_signature(struct kimage *image) +{ + const char *reason; + int ret; + + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + switch (ret) { + case 0: + break; + + /* Certain verification errors are non-fatal if we're not + * checking errors, provided we aren't mandating that there + * must be a valid signature. + */ + case -ENODATA: + reason = "kexec of unsigned image"; + goto decide; + case -ENOPKG: + reason = "kexec of image with unsupported crypto"; + goto decide; + case -ENOKEY: + reason = "kexec of image with unavailable key"; + decide: + if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { + pr_notice("%s rejected\n", reason); + return ret; + } + + return 0; + + /* All other errors are fatal, including nomem, unparseable + * signatures and signature check failures - even if signatures + * aren't required. + */ + default: + pr_notice("kernel signature verification failed (%d).\n", ret); + } + + return ret; +} +#endif + /* * In file mode list of segments is prepared by kernel. 
Copy relevant * data from user space, do error checking, prepare segment list @@ -186,7 +231,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, const char __user *cmdline_ptr, unsigned long cmdline_len, unsigned flags) { - int ret = 0; + int ret; void *ldata; loff_t size; @@ -205,14 +250,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, if (ret) goto out; -#ifdef CONFIG_KEXEC_VERIFY_SIG - ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, - image->kernel_buf_len); - if (ret) { - pr_debug("kernel signature verification failed.\n"); +#ifdef CONFIG_KEXEC_SIG + ret = kimage_validate_signature(image); + + if (ret) goto out; - } - pr_debug("kernel signature verification successful.\n"); #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { -- cgit v1.2.3 From 155bdd30af17e90941589b5db4dab9a29b28c112 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Mon, 19 Aug 2019 17:17:45 -0700 Subject: kexec_file: Restrict at runtime if the kernel is locked down When KEXEC_SIG is not enabled, kernel should not load images through kexec_file systemcall if the kernel is locked down. [Modified by David Howells to fit with modifications to the previous patch and to return -EPERM if the kernel is locked down for consistency with other lockdowns. Modified by Matthew Garrett to remove the IMA integration, which will be replaced by integrating with the IMA architecture policy patches.] Signed-off-by: Jiri Bohac Signed-off-by: David Howells Signed-off-by: Matthew Garrett cc: kexec@lists.infradead.org Signed-off-by: James Morris --- kernel/kexec_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 972931201995..43109ef4d6bf 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -208,7 +208,7 @@ kimage_validate_signature(struct kimage *image) return ret; } - return 0; + return security_locked_down(LOCKDOWN_KEXEC); /* All other errors are fatal, including nomem, unparseable * signatures and signature check failures - even if signatures -- cgit v1.2.3 From 38bd94b8a1bd46e6d3d9718c7ff582e4c8ccb440 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Mon, 19 Aug 2019 17:17:46 -0700 Subject: hibernate: Disable when the kernel is locked down There is currently no way to verify the resume image when returning from hibernate. This might compromise the signed modules trust model, so until we can work with signed hibernate images we disable it when the kernel is locked down. 
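One way to observe the effect without attempting an actual hibernation is that "disk" stops being offered as a sleep state. A small check along those lines (an illustration assuming the usual /sys/power/state formatting; nothing in this patch defines that file):

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          char buf[256] = "";
          FILE *f = fopen("/sys/power/state", "r");

          if (!f || !fgets(buf, sizeof(buf), f)) {
                  perror("/sys/power/state");
                  return 1;
          }
          fclose(f);

          /* Typically "freeze mem disk"; "disk" disappears whenever
           * hibernation_available() returns false (nohibernate or lockdown). */
          printf("hibernation %s\n", strstr(buf, "disk") ? "offered" : "not offered");
          return 0;
  }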
Signed-off-by: Josh Boyer Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: rjw@rjwysocki.net Cc: pavel@ucw.cz cc: linux-pm@vger.kernel.org Signed-off-by: James Morris --- kernel/power/hibernate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index cd7434e6000d..3c0a5a8170b0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "power.h" @@ -68,7 +69,7 @@ static const struct platform_hibernation_ops *hibernation_ops; bool hibernation_available(void) { - return (nohibernate == 0); + return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); } /** -- cgit v1.2.3 From 20657f66ef52e5005369e4ef539d4cbf01eab10d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:55 -0700 Subject: lockdown: Lock down module params that specify hardware parameters (eg. ioport) Provided an annotation for module parameters that specify hardware parameters (such as io ports, iomem addresses, irqs, dma channels, fixed dma buffers and other types). Suggested-by: Alan Cox Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: Jessica Yu Signed-off-by: James Morris --- kernel/params.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index cf448785d058..8e56f8b12d8f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_SYSFS /* Protects all built-in parameters, modules use their own param_lock */ @@ -96,13 +97,19 @@ bool parameq(const char *a, const char *b) return parameqn(a, b, strlen(a)+1); } -static void param_check_unsafe(const struct kernel_param *kp) +static bool param_check_unsafe(const struct kernel_param *kp) { + if (kp->flags & KERNEL_PARAM_FL_HWPARAM && + security_locked_down(LOCKDOWN_MODULE_PARAMETERS)) + return false; + if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { pr_notice("Setting dangerous option %s - tainting kernel\n", kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } + + return true; } static int parse_one(char *param, @@ -132,8 +139,10 @@ static int parse_one(char *param, pr_debug("handling %s with %p\n", param, params[i].ops->set); kernel_param_lock(params[i].mod); - param_check_unsafe(¶ms[i]); - err = params[i].ops->set(val, ¶ms[i]); + if (param_check_unsafe(¶ms[i])) + err = params[i].ops->set(val, ¶ms[i]); + else + err = -EPERM; kernel_param_unlock(params[i].mod); return err; } @@ -553,8 +562,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr, return -EPERM; kernel_param_lock(mk->mod); - param_check_unsafe(attribute->param); - err = attribute->param->ops->set(buf, attribute->param); + if (param_check_unsafe(attribute->param)) + err = attribute->param->ops->set(buf, attribute->param); + else + err = -EPERM; kernel_param_unlock(mk->mod); if (!err) return len; -- cgit v1.2.3 From a94549dd87f5ea4ca50fee493df08a2dc6256b53 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:58 -0700 Subject: lockdown: Lock down tracing and perf kprobes when in confidentiality mode Disallow the creation of perf and ftrace kprobes when the kernel is locked down in confidentiality mode by preventing their registration. 
This prevents kprobes from being used to access kernel memory to steal crypto data, but continues to allow the use of kprobes from signed modules. Reported-by: Alexei Starovoitov Signed-off-by: David Howells Signed-off-by: Matthew Garrett Acked-by: Masami Hiramatsu Reviewed-by: Kees Cook Cc: Naveen N. Rao Cc: Anil S Keshavamurthy Cc: davem@davemloft.net Cc: Masami Hiramatsu Signed-off-by: James Morris --- kernel/trace/trace_kprobe.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7d736248a070..fcb28b0702b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" @@ -415,6 +416,10 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + if (trace_probe_is_registered(&tk->tp)) return -EINVAL; -- cgit v1.2.3 From 9d1f8be5cf42b497a3bddf1d523f2bb142e9318c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:59 -0700 Subject: bpf: Restrict bpf when kernel lockdown is in confidentiality mode bpf_read() and bpf_read_str() could potentially be abused to (eg) allow private keys in kernel memory to be leaked. Disable them if the kernel has been locked down in confidentiality mode. Suggested-by: Alexei Starovoitov Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: netdev@vger.kernel.org cc: Chun-Yi Lee cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: James Morris --- kernel/trace/bpf_trace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1c9a4745e596..33a954c367f3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -139,8 +139,13 @@ BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; + ret = security_locked_down(LOCKDOWN_BPF_READ); + if (ret < 0) + goto out; + ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) +out: memset(dst, 0, size); return ret; @@ -566,6 +571,10 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, { int ret; + ret = security_locked_down(LOCKDOWN_BPF_READ); + if (ret < 0) + goto out; + /* * The strncpy_from_unsafe() call will likely not fill the entire * buffer, but that's okay in this circumstance as we're probing @@ -577,6 +586,7 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, */ ret = strncpy_from_unsafe(dst, unsafe_ptr, size); if (unlikely(ret < 0)) +out: memset(dst, 0, size); return ret; -- cgit v1.2.3 From b0c8fdc7fdb77586c3d1937050925b960743306e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:18:00 -0700 Subject: lockdown: Lock down perf when in confidentiality mode Disallow the use of certain perf facilities that might allow userspace to access kernel data. 
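Concretely, only requests that sample interrupted register state (PERF_SAMPLE_REGS_INTR, per the hunk below) are refused; ordinary counting and sampling keep working. A hypothetical userspace probe of that distinction (illustrative only; the register mask and the non-lockdown failure modes depend on the architecture and on perf_event_paranoid):

  #include <linux/perf_event.h>
  #include <errno.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static long open_counter(unsigned long long extra_sample_bits)
  {
          struct perf_event_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.size = sizeof(attr);
          attr.type = PERF_TYPE_SOFTWARE;
          attr.config = PERF_COUNT_SW_CPU_CLOCK;
          attr.sample_period = 100000;
          attr.sample_type = PERF_SAMPLE_IP | extra_sample_bits;
          attr.sample_regs_intr = extra_sample_bits ? 0x1 : 0;  /* bit 0: one arch-specific register */
          attr.exclude_kernel = 1;

          return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
  }

  int main(void)
  {
          if (open_counter(0) < 0)
                  printf("plain counter: %s\n", strerror(errno));
          if (open_counter(PERF_SAMPLE_REGS_INTR) < 0)
                  printf("REGS_INTR:     %s\n", strerror(errno));  /* EPERM when locked down */
          return 0;
  }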
Signed-off-by: David Howells Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Signed-off-by: James Morris --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f85929ce13be..8732f980a4fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10798,6 +10798,13 @@ SYSCALL_DEFINE5(perf_event_open, perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; + err = security_locked_down(LOCKDOWN_PERF); + if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) + /* REGS_INTR can leak data, lockdown must prevent this */ + return err; + + err = 0; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument -- cgit v1.2.3 From 29d3c1c8dfe752c01b7115ecd5a3142b232a38e1 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 19 Aug 2019 17:18:01 -0700 Subject: kexec: Allow kexec_file() with appropriate IMA policy when locked down Systems in lockdown mode should block the kexec of untrusted kernels. For x86 and ARM we can ensure that a kernel is trustworthy by validating a PE signature, but this isn't possible on other architectures. On those platforms we can use IMA digital signatures instead. Add a function to determine whether IMA has or will verify signatures for a given event type, and if so permit kexec_file() even if the kernel is otherwise locked down. This is restricted to cases where CONFIG_INTEGRITY_TRUSTED_KEYRING is set in order to prevent an attacker from loading additional keys at runtime. Signed-off-by: Matthew Garrett Acked-by: Mimi Zohar Cc: Dmitry Kasatkin Cc: linux-integrity@vger.kernel.org Signed-off-by: James Morris --- kernel/kexec_file.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 43109ef4d6bf..7f4a618fc8c1 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -208,7 +208,15 @@ kimage_validate_signature(struct kimage *image) return ret; } - return security_locked_down(LOCKDOWN_KEXEC); + /* If IMA is guaranteed to appraise a signature on the kexec + * image, permit it even if the kernel is otherwise locked + * down. + */ + if (!ima_appraise_signature(READING_KEXEC_IMAGE) && + security_locked_down(LOCKDOWN_KEXEC)) + return -EPERM; + + return 0; /* All other errors are fatal, including nomem, unparseable * signatures and signature check failures - even if signatures -- cgit v1.2.3 From c7d8b7824ff9de866a356e1892dbe9f191aa5d06 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Aug 2019 20:15:42 -0300 Subject: hmm: use mmu_notifier_get/put for 'struct hmm' This is a significant simplification, it eliminates all the remaining 'hmm' stuff in mm_struct, eliminates krefing along the critical notifier paths, and takes away all the ugly locking and abuse of page_table_lock. mmu_notifier_get() provides the single struct hmm per struct mm which eliminates mm->hmm. It also directly guarantees that no mmu_notifier op callback is callable while concurrent free is possible, this eliminates all the krefs inside the mmu_notifier callbacks. The remaining krefs in the range code were overly cautious, drivers are already not permitted to free the mirror while a range exists. 
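The driver-facing pattern that replaces the per-driver hmm refcounting looks roughly like this (a sketch of the mmu_notifier_get()/mmu_notifier_put() usage the series relies on; the exact signatures and locking rules belong to the mmu_notifier API and are not defined by this kernel/ hunk, and my_driver_mmu_ops is a hypothetical ops table):

  struct mmu_notifier *mn;

  /* One notifier per mm, shared by every user registering the same ops;
   * created via ops->alloc_notifier() on first use. */
  mn = mmu_notifier_get(&my_driver_mmu_ops, current->mm);
  if (IS_ERR(mn))
          return PTR_ERR(mn);

  /* ... mn stays valid while callbacks can still run ... */

  mmu_notifier_put(mn);  /* drop the reference; freeing is deferred safely */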
Link: https://lore.kernel.org/r/20190806231548.25242-6-jgg@ziepe.ca Reviewed-by: Christoph Hellwig Reviewed-by: Ralph Campbell Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d8ae0f1b4148..bd4a0762f12f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1007,7 +1007,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); - hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; -- cgit v1.2.3 From 0c385190392d8c7128fb7517b3c676e19c7b8808 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Aug 2019 11:05:54 +0200 Subject: resource: add a not device managed request_free_mem_region variant Factor out the guts of devm_request_free_mem_region so that we can implement both a device managed and a manually release version as tiny wrappers around it. Link: https://lore.kernel.org/r/20190818090557.17853-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Bharata B Rao Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- kernel/resource.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 7ea4306503c5..74877e9d90ca 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1644,19 +1644,8 @@ void resource_list_free(struct list_head *head) EXPORT_SYMBOL(resource_list_free); #ifdef CONFIG_DEVICE_PRIVATE -/** - * devm_request_free_mem_region - find free region for device private memory - * - * @dev: device struct to bind the resource to - * @size: size in bytes of the device memory to add - * @base: resource tree to look in - * - * This function tries to find an empty range of physical address big enough to - * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE - * memory, which in turn allocates struct pages. - */ -struct resource *devm_request_free_mem_region(struct device *dev, - struct resource *base, unsigned long size) +static struct resource *__request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size, const char *name) { resource_size_t end, addr; struct resource *res; @@ -1670,7 +1659,10 @@ struct resource *devm_request_free_mem_region(struct device *dev, REGION_DISJOINT) continue; - res = devm_request_mem_region(dev, addr, size, dev_name(dev)); + if (dev) + res = devm_request_mem_region(dev, addr, size, name); + else + res = request_mem_region(addr, size, name); if (!res) return ERR_PTR(-ENOMEM); res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; @@ -1679,7 +1671,32 @@ struct resource *devm_request_free_mem_region(struct device *dev, return ERR_PTR(-ERANGE); } + +/** + * devm_request_free_mem_region - find free region for device private memory + * + * @dev: device struct to bind the resource to + * @size: size in bytes of the device memory to add + * @base: resource tree to look in + * + * This function tries to find an empty range of physical address big enough to + * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE + * memory, which in turn allocates struct pages. 
+ */ +struct resource *devm_request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size) +{ + return __request_free_mem_region(dev, base, size, dev_name(dev)); +} EXPORT_SYMBOL_GPL(devm_request_free_mem_region); + +struct resource *request_free_mem_region(struct resource *base, + unsigned long size, const char *name) +{ + return __request_free_mem_region(NULL, base, size, name); +} +EXPORT_SYMBOL_GPL(request_free_mem_region); + #endif /* CONFIG_DEVICE_PRIVATE */ static int __init strict_iomem(char *str) -- cgit v1.2.3 From fdc029b19dfd190840d23e5c70bd231140b0e4a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Aug 2019 11:05:55 +0200 Subject: memremap: remove the dev field in struct dev_pagemap The dev field in struct dev_pagemap is only used to print dev_name in two places, which are at best nice to have. Just remove the field and thus the name in those two messages. Link: https://lore.kernel.org/r/20190818090557.17853-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Bharata B Rao Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 6ee03a816d67..600a14cbe663 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -96,7 +96,6 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; - struct device *dev = pgmap->dev; struct resource *res = &pgmap->res; unsigned long pfn; int nid; @@ -123,8 +122,7 @@ static void devm_memremap_pages_release(void *data) untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); pgmap_array_delete(res); - dev_WARN_ONCE(dev, pgmap->altmap.alloc, - "%s: failed to free all reserved pages\n", __func__); + WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); } static void dev_pagemap_percpu_release(struct percpu_ref *ref) @@ -245,8 +243,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_array; } - pgmap->dev = dev; - error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), pgmap, GFP_KERNEL)); if (error) -- cgit v1.2.3 From 6f42193fd86eb5eb518984032c85e9b5582b7625 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Aug 2019 11:05:56 +0200 Subject: memremap: don't use a separate devm action for devmap_managed_enable_get Just clean up for early failures and then piggy back on devm_memremap_pages_release. This helps with a pending not device managed version of devm_memremap_pages. 
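The practical point of the cleanup is that the static-branch reference no longer needs a struct device in order to be dropped. A sketch of the before/after, restating the hunk below rather than adding behaviour:

  /* before: the enable path tied cleanup to the device via a devm action */
  return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL);

  /* after: cleanup is part of the pagemap teardown itself, so a caller
   * without a device (the upcoming memremap_pages()) can share it */
  static void devm_memremap_pages_release(void *data)
  {
          /* ... existing teardown ... */
          devmap_managed_enable_put();
  }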
Link: https://lore.kernel.org/r/20190818090557.17853-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Bharata B Rao Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 600a14cbe663..09a087ca30ff 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -21,13 +21,13 @@ DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); static atomic_t devmap_managed_enable; -static void devmap_managed_enable_put(void *data) +static void devmap_managed_enable_put(void) { if (atomic_dec_and_test(&devmap_managed_enable)) static_branch_disable(&devmap_managed_key); } -static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +static int devmap_managed_enable_get(struct dev_pagemap *pgmap) { if (!pgmap->ops || !pgmap->ops->page_free) { WARN(1, "Missing page_free method\n"); @@ -36,13 +36,16 @@ static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgm if (atomic_inc_return(&devmap_managed_enable) == 1) static_branch_enable(&devmap_managed_key); - return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL); + return 0; } #else -static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +static int devmap_managed_enable_get(struct dev_pagemap *pgmap) { return -EINVAL; } +static void devmap_managed_enable_put(void) +{ +} #endif /* CONFIG_DEV_PAGEMAP_OPS */ static void pgmap_array_delete(struct resource *res) @@ -123,6 +126,7 @@ static void devm_memremap_pages_release(void *data) untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); pgmap_array_delete(res); WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); + devmap_managed_enable_put(); } static void dev_pagemap_percpu_release(struct percpu_ref *ref) @@ -212,7 +216,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) } if (need_devmap_managed) { - error = devmap_managed_enable_get(dev, pgmap); + error = devmap_managed_enable_get(pgmap); if (error) return ERR_PTR(error); } @@ -321,6 +325,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_array: dev_pagemap_kill(pgmap); dev_pagemap_cleanup(pgmap); + devmap_managed_enable_put(); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); -- cgit v1.2.3 From 6869b7b206595ae0e326f59719090351eb8f4f5d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Aug 2019 11:05:57 +0200 Subject: memremap: provide a not device managed memremap_pages The kvmppc ultravisor code wants a device private memory pool that is system wide and not attached to a device. Instead of faking up one provide a low-level memremap_pages for it. 
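For illustration, a minimal sketch of how such a system-wide pool might be set up without a struct device, using request_free_mem_region() together with the new memremap_pages()/memunmap_pages() pair. The ops, pool name and sizing below are invented for the sketch and are not taken from the kvmppc code:

	#include <linux/ioport.h>
	#include <linux/memremap.h>
	#include <linux/mm.h>

	static struct dev_pagemap pool_pgmap;

	static void pool_page_free(struct page *page)
	{
		/* return the backing page to the pool's own allocator */
	}

	static const struct dev_pagemap_ops pool_pgmap_ops = {
		.page_free	= pool_page_free,
	};

	static void *pool_init(unsigned long size)
	{
		struct resource *res;

		/* find a free physical range; "private-pool" is a made-up name */
		res = request_free_mem_region(&iomem_resource, size, "private-pool");
		if (IS_ERR(res))
			return res;

		pool_pgmap.type = MEMORY_DEVICE_PRIVATE;
		pool_pgmap.res = *res;
		pool_pgmap.ops = &pool_pgmap_ops;

		/* no device to derive a NUMA node from, so pick one explicitly */
		return memremap_pages(&pool_pgmap, numa_mem_id());
	}

	static void pool_exit(void)
	{
		memunmap_pages(&pool_pgmap);
	}

The teardown pairs memremap_pages() with memunmap_pages() directly instead of relying on a devm action, which is exactly what the non device managed interface is meant for.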
Link: https://lore.kernel.org/r/20190818090557.17853-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Bharata B Rao Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 84 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 09a087ca30ff..77a77704eb28 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -96,9 +96,8 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) } } -static void devm_memremap_pages_release(void *data) +void memunmap_pages(struct dev_pagemap *pgmap) { - struct dev_pagemap *pgmap = data; struct resource *res = &pgmap->res; unsigned long pfn; int nid; @@ -128,6 +127,12 @@ static void devm_memremap_pages_release(void *data) WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); devmap_managed_enable_put(); } +EXPORT_SYMBOL_GPL(memunmap_pages); + +static void devm_memremap_pages_release(void *data) +{ + memunmap_pages(data); +} static void dev_pagemap_percpu_release(struct percpu_ref *ref) { @@ -137,27 +142,12 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) complete(&pgmap->done); } -/** - * devm_memremap_pages - remap and provide memmap backing for the given resource - * @dev: hosting device for @res - * @pgmap: pointer to a struct dev_pagemap - * - * Notes: - * 1/ At a minimum the res and type members of @pgmap must be initialized - * by the caller before passing it to this function - * - * 2/ The altmap field may optionally be initialized, in which case - * PGMAP_ALTMAP_VALID must be set in pgmap->flags. - * - * 3/ The ref field may optionally be provided, in which pgmap->ref must be - * 'live' on entry and will be killed and reaped at - * devm_memremap_pages_release() time, or if this routine fails. - * - * 4/ res is expected to be a host memory range that could feasibly be - * treated as a "System RAM" range, i.e. not a device mmio range, but - * this is not enforced. +/* + * Not device managed version of dev_memremap_pages, undone by + * memunmap_pages(). Please use dev_memremap_pages if you have a struct + * device available. 
*/ -void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) +void *memremap_pages(struct dev_pagemap *pgmap, int nid) { struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; @@ -168,7 +158,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) .altmap = pgmap_altmap(pgmap), }; pgprot_t pgprot = PAGE_KERNEL; - int error, nid, is_ram; + int error, is_ram; bool need_devmap_managed = true; switch (pgmap->type) { @@ -223,7 +213,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); if (conflict_pgmap) { - dev_WARN(dev, "Conflicting mapping in same section\n"); + WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); error = -ENOMEM; goto err_array; @@ -231,7 +221,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); if (conflict_pgmap) { - dev_WARN(dev, "Conflicting mapping in same section\n"); + WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); error = -ENOMEM; goto err_array; @@ -252,7 +242,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (error) goto err_array; - nid = dev_to_node(dev); if (nid < 0) nid = numa_mem_id(); @@ -308,12 +297,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) PHYS_PFN(res->start), PHYS_PFN(resource_size(res)), pgmap); percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); - - error = devm_add_action_or_reset(dev, devm_memremap_pages_release, - pgmap); - if (error) - return ERR_PTR(error); - return __va(res->start); err_add_memory: @@ -328,6 +311,43 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) devmap_managed_enable_put(); return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(memremap_pages); + +/** + * devm_memremap_pages - remap and provide memmap backing for the given resource + * @dev: hosting device for @res + * @pgmap: pointer to a struct dev_pagemap + * + * Notes: + * 1/ At a minimum the res and type members of @pgmap must be initialized + * by the caller before passing it to this function + * + * 2/ The altmap field may optionally be initialized, in which case + * PGMAP_ALTMAP_VALID must be set in pgmap->flags. + * + * 3/ The ref field may optionally be provided, in which pgmap->ref must be + * 'live' on entry and will be killed and reaped at + * devm_memremap_pages_release() time, or if this routine fails. + * + * 4/ res is expected to be a host memory range that could feasibly be + * treated as a "System RAM" range, i.e. not a device mmio range, but + * this is not enforced. + */ +void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) +{ + int error; + void *ret; + + ret = memremap_pages(pgmap, dev_to_node(dev)); + if (IS_ERR(ret)) + return ret; + + error = devm_add_action_or_reset(dev, devm_memremap_pages_release, + pgmap); + if (error) + return ERR_PTR(error); + return ret; +} EXPORT_SYMBOL_GPL(devm_memremap_pages); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) -- cgit v1.2.3 From ede6bc88d6bbe16a938de2b00f60f4e03768c988 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 20 Aug 2019 01:36:52 +0000 Subject: bpf: Use PTR_ERR_OR_ZERO in xsk_map_inc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR. 
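For reference, PTR_ERR_OR_ZERO() from <linux/err.h> behaves roughly like the open-coded pattern it replaces:

	static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
	{
		if (IS_ERR(ptr))
			return PTR_ERR(ptr);
		else
			return 0;
	}

so the conversion below is purely cosmetic and does not change behaviour.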
Signed-off-by: YueHaibing Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 4cc28e226398..942c662e2eed 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -21,7 +21,7 @@ int xsk_map_inc(struct xsk_map *map) struct bpf_map *m = &map->map; m = bpf_map_inc(m, false); - return IS_ERR(m) ? PTR_ERR(m) : 0; + return PTR_ERR_OR_ZERO(m); } void xsk_map_put(struct xsk_map *map) -- cgit v1.2.3 From 3481e64bbe9876e359aec74c9e93039b94f678b0 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 20 Aug 2019 14:53:46 +0100 Subject: bpf: add BTF ids in procfs for file descriptors to BTF objects Implement the show_fdinfo hook for BTF FDs file operations, and make it print the id of the BTF object. This allows for a quick retrieval of the BTF id from its FD; or it can help understanding what type of object (BTF) the file descriptor points to. v2: - Do not expose data_size, only btf_id, in FD info. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5fcc7a17eb5a..6b403dc18486 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3376,6 +3376,15 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); } +#ifdef CONFIG_PROC_FS +static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct btf *btf = filp->private_data; + + seq_printf(m, "btf_id:\t%u\n", btf->id); +} +#endif + static int btf_release(struct inode *inode, struct file *filp) { btf_put(filp->private_data); @@ -3383,6 +3392,9 @@ static int btf_release(struct inode *inode, struct file *filp) } const struct file_operations btf_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_btf_show_fdinfo, +#endif .release = btf_release, }; -- cgit v1.2.3 From 1b9ed84ecf268904d89edf2908426a8eb3b5a4ba Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 20 Aug 2019 10:31:50 +0100 Subject: bpf: add new BPF_BTF_GET_NEXT_ID syscall command Add a new command for the bpf() system call: BPF_BTF_GET_NEXT_ID is used to cycle through all BTF objects loaded on the system. The motivation is to be able to inspect (list) all BTF objects presents on the system. 
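For illustration, user space could walk all BTF IDs with the new command roughly as follows, assuming a uapi header new enough to define BPF_BTF_GET_NEXT_ID (raw syscall shown; the wrapper name is invented for the sketch):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static int btf_get_next_id(__u32 start_id, __u32 *next_id)
	{
		union bpf_attr attr;
		int err;

		memset(&attr, 0, sizeof(attr));
		attr.start_id = start_id;

		err = syscall(__NR_bpf, BPF_BTF_GET_NEXT_ID, &attr, sizeof(attr));
		if (!err)
			*next_id = attr.next_id;
		return err;
	}

	int main(void)
	{
		__u32 id = 0;

		while (!btf_get_next_id(id, &id))
			printf("btf id: %u\n", id);

		/* the kernel signals the end of the list with -ENOENT */
		return errno == ENOENT ? 0 : 1;
	}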
Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 4 ++-- kernel/bpf/syscall.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b403dc18486..adb3adcebe3c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -195,8 +195,8 @@ i < btf_type_vlen(struct_type); \ i++, member++) -static DEFINE_IDR(btf_idr); -static DEFINE_SPINLOCK(btf_idr_lock); +DEFINE_IDR(btf_idr); +DEFINE_SPINLOCK(btf_idr_lock); struct btf { void *data; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf8052b016e7..c0f62fd67c6b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2884,6 +2884,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &map_idr, &map_idr_lock); break; + case BPF_BTF_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &btf_idr, &btf_idr_lock); + break; case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; -- cgit v1.2.3 From 3e91ec89f527b9870fe42dcbdb74fd389d123a95 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 15 Aug 2019 16:44:00 +0100 Subject: arm64: Tighten the PR_{SET, GET}_TAGGED_ADDR_CTRL prctl() unused arguments Require that arg{3,4,5} of the PR_{SET,GET}_TAGGED_ADDR_CTRL prctl and arg2 of the PR_GET_TAGGED_ADDR_CTRL prctl() are zero rather than ignored for future extensions. Acked-by: Andrey Konovalov Signed-off-by: Catalin Marinas Signed-off-by: Will Deacon --- kernel/sys.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index c6c4d5358bd3..ec48396b4943 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2499,9 +2499,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = PAC_RESET_KEYS(me, arg2); break; case PR_SET_TAGGED_ADDR_CTRL: + if (arg3 || arg4 || arg5) + return -EINVAL; error = SET_TAGGED_ADDR_CTRL(arg2); break; case PR_GET_TAGGED_ADDR_CTRL: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; error = GET_TAGGED_ADDR_CTRL(); break; default: -- cgit v1.2.3 From ec8f954a40da8cd3d159713b608e901f0cd909a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2019 07:35:59 +0200 Subject: posix-timers: Use a callback for cancel synchronization on PREEMPT_RT Posix timer delete retry loops are affected by the same priority inversion and live lock issues as the other timers. Provide a RT specific synchronization function which keeps a reference to the timer by holding rcu read lock to prevent the timer from being freed, dropping the timer lock and invoking the timer specific wait function via a new callback. This does not yet cover posix CPU timers because they need more special treatment on PREEMPT_RT. [ This is folded into the original attempt which did not use a callback. 
] Originally-by: Anna-Maria Gleixenr Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190819143801.656864506@linutronix.de --- kernel/time/alarmtimer.c | 14 ++++++++++++++ kernel/time/posix-timers.c | 18 +++++++++++++++++- kernel/time/posix-timers.h | 1 + 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 36947449dba2..ec32876e284d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -605,6 +605,19 @@ static int alarm_timer_try_to_cancel(struct k_itimer *timr) return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); } +/** + * alarm_timer_wait_running - Posix timer callback to wait for a timer + * @timr: Pointer to the posixtimer data struct + * + * Called from the core code when timer cancel detected that the callback + * is running. @timr is unlocked and rcu read lock is held to prevent it + * from being freed. + */ +static void alarm_timer_wait_running(struct k_itimer *timr) +{ + hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer); +} + /** * alarm_timer_arm - Posix timer callback to arm a timer * @timr: Pointer to the posixtimer data struct @@ -834,6 +847,7 @@ const struct k_clock alarm_clock = { .timer_forward = alarm_timer_forward, .timer_remaining = alarm_timer_remaining, .timer_try_to_cancel = alarm_timer_try_to_cancel, + .timer_wait_running = alarm_timer_wait_running, .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 3e663f982c82..9e377830cc10 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -805,13 +805,25 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr) return hrtimer_try_to_cancel(&timr->it.real.timer); } +static void common_timer_wait_running(struct k_itimer *timer) +{ + hrtimer_cancel_wait_running(&timer->it.real.timer); +} + static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags) { + const struct k_clock *kc = READ_ONCE(timer->kclock); timer_t timer_id = READ_ONCE(timer->it_id); + /* Prevent kfree(timer) after dropping the lock */ + rcu_read_lock(); unlock_timer(timer, *flags); - cpu_relax(); + + if (!WARN_ON_ONCE(!kc->timer_wait_running)) + kc->timer_wait_running(timer); + + rcu_read_unlock(); /* Relock the timer. It might be not longer hashed. 
*/ return lock_timer(timer_id, flags); } @@ -1255,6 +1267,7 @@ static const struct k_clock clock_realtime = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; @@ -1270,6 +1283,7 @@ static const struct k_clock clock_monotonic = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; @@ -1300,6 +1314,7 @@ static const struct k_clock clock_tai = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; @@ -1315,6 +1330,7 @@ static const struct k_clock clock_boottime = { .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index de5daa6d975a..897c29e162b9 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -24,6 +24,7 @@ struct k_clock { int (*timer_try_to_cancel)(struct k_itimer *timr); void (*timer_arm)(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none); + void (*timer_wait_running)(struct k_itimer *timr); }; extern const struct k_clock clock_posix_cpu; -- cgit v1.2.3 From 0bee3b601b77dbe7981b5474ae8758d6bf60177a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 20 Aug 2019 15:12:23 +0200 Subject: hrtimer: Improve comments on handling priority inversion against softirq kthread The handling of a priority inversion between timer cancelling and a a not well defined possible preemption of softirq kthread is not very clear. Especially in the posix timers side it's unclear why there is a specific RT wait callback. All the nice explanations can be found in the initial changelog of f61eff83cec9 (hrtimer: Prepare support for PREEMPT_RT"). Extract the detailed informations from there and put it into comments. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190820132656.GC2093@lenoir --- kernel/time/hrtimer.c | 14 ++++++++++---- kernel/time/posix-timers.c | 6 ++++++ 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 499122752649..833353732554 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1201,10 +1201,16 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, * deletion of a timer failed because the timer callback function was * running. * - * This prevents priority inversion, if the softirq thread on a remote CPU - * got preempted, and it prevents a life lock when the task which tries to - * delete a timer preempted the softirq thread running the timer callback - * function. + * This prevents priority inversion: if the soft irq thread is preempted + * in the middle of a timer callback, then calling del_timer_sync() can + * lead to two issues: + * + * - If the caller is on a remote CPU then it has to spin wait for the timer + * handler to complete. This can result in unbound priority inversion. 
+ * + * - If the caller originates from the task which preempted the timer + * handler on the same CPU, then spin waiting for the timer handler to + * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 9e377830cc10..0ec5b7a1d769 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -810,6 +810,12 @@ static void common_timer_wait_running(struct k_itimer *timer) hrtimer_cancel_wait_running(&timer->it.real.timer); } +/* + * On PREEMPT_RT this prevent priority inversion against softirq kthread in + * case it gets preempted while executing a timer callback. See comments in + * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a + * cpu_relax(). + */ static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags) { -- cgit v1.2.3 From 7cb9a94c158b956f46cf093ed966d0c1e996dddb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 Aug 2019 16:31:45 +0200 Subject: posix-cpu-timers: Fixup stale comment The comment above cleanup_timers() is outdated. The timers are only removed from the task/process list heads but not modified in any other way. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190819143801.747233612@linutronix.de --- kernel/time/posix-cpu-timers.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0a426f4e3125..742d4a4e6f71 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -412,9 +412,10 @@ static void cleanup_timers_list(struct list_head *head) } /* - * Clean out CPU timers still ticking when a thread exited. The task - * pointer is cleared, and the expiry time is replaced with the residual - * time for later timer_gettime calls to return. + * Clean out CPU timers which are still armed when a thread exits. The + * timers are only removed from the list. No other updates are done. The + * corresponding posix timers are still accessible, but cannot be rearmed. + * * This must be called with the siglock held. */ static void cleanup_timers(struct list_head *head) -- cgit v1.2.3 From 90ae409f9eb3bcaf38688f9ec22375816053a08e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Aug 2019 11:45:49 +0900 Subject: dma-direct: fix zone selection after an unaddressable CMA allocation The new dma_alloc_contiguous hides if we allocate CMA or regular pages, and thus fails to retry a ZONE_NORMAL allocation if the CMA allocation succeeds but isn't addressable. That means we either fail outright or dip into a small zone that might not succeed either. Thanks to Hillf Danton for debugging this issue. Fixes: b1d2dc009dec ("dma-contiguous: add dma_{alloc,free}_contiguous() helpers") Reported-by: Tobias Klausmann Signed-off-by: Christoph Hellwig Tested-by: Tobias Klausmann --- kernel/dma/contiguous.c | 8 ++------ kernel/dma/direct.c | 10 +++++++++- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 2bd410f934b3..69cfb4345388 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -230,9 +230,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { - int node = dev ? 
dev_to_node(dev) : NUMA_NO_NODE; - size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT; - size_t align = get_order(PAGE_ALIGN(size)); + size_t count = size >> PAGE_SHIFT; struct page *page = NULL; struct cma *cma = NULL; @@ -243,14 +241,12 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) /* CMA can be used only in the context which permits sleeping */ if (cma && gfpflags_allow_blocking(gfp)) { + size_t align = get_order(size); size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); } - /* Fallback allocation of normal pages */ - if (!page) - page = alloc_pages_node(node, gfp, align); return page; } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 795c9b095d75..706113c6bebc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -85,6 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { + size_t alloc_size = PAGE_ALIGN(size); + int node = dev_to_node(dev); struct page *page = NULL; u64 phys_mask; @@ -95,8 +97,14 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp &= ~__GFP_ZERO; gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_mask); + page = dma_alloc_contiguous(dev, alloc_size, gfp); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + dma_free_contiguous(dev, page, alloc_size); + page = NULL; + } again: - page = dma_alloc_contiguous(dev, size, gfp); + if (!page) + page = alloc_pages_node(node, gfp, get_order(alloc_size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); page = NULL; -- cgit v1.2.3 From 2434aea58e652a9fe114181ac90aa60e2f2e1b25 Mon Sep 17 00:00:00 2001 From: Tri Vo Date: Tue, 6 Aug 2019 18:48:45 -0700 Subject: PM / wakeup: Use wakeup_source_register() in wakelock.c kernel/power/wakelock.c duplicates wakeup source creation and registration code from drivers/base/power/wakeup.c. Change struct wakelock's wakeup source to a pointer and use wakeup_source_register() function to create and register said wakeup source. Use wakeup_source_unregister() on cleanup path. Signed-off-by: Tri Vo Reviewed-by: Stephen Boyd Reviewed-by: Greg Kroah-Hartman Signed-off-by: Rafael J. 
Wysocki --- kernel/power/wakelock.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 4210152e56f0..d1eb7fd98b64 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(wakelocks_lock); struct wakelock { char *name; struct rb_node node; - struct wakeup_source ws; + struct wakeup_source *ws; #ifdef CONFIG_PM_WAKELOCKS_GC struct list_head lru; #endif @@ -46,7 +46,7 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { wl = rb_entry(node, struct wakelock, node); - if (wl->ws.active == show_active) + if (wl->ws->active == show_active) str += scnprintf(str, end - str, "%s ", wl->name); } if (str > buf) @@ -112,16 +112,16 @@ static void __wakelocks_gc(struct work_struct *work) u64 idle_time_ns; bool active; - spin_lock_irq(&wl->ws.lock); - idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); - active = wl->ws.active; - spin_unlock_irq(&wl->ws.lock); + spin_lock_irq(&wl->ws->lock); + idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws->last_time)); + active = wl->ws->active; + spin_unlock_irq(&wl->ws->lock); if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) break; if (!active) { - wakeup_source_remove(&wl->ws); + wakeup_source_unregister(wl->ws); rb_erase(&wl->node, &wakelocks_tree); list_del(&wl->lru); kfree(wl->name); @@ -187,9 +187,15 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len, kfree(wl); return ERR_PTR(-ENOMEM); } - wl->ws.name = wl->name; - wl->ws.last_time = ktime_get(); - wakeup_source_add(&wl->ws); + + wl->ws = wakeup_source_register(wl->name); + if (!wl->ws) { + kfree(wl->name); + kfree(wl); + return ERR_PTR(-ENOMEM); + } + wl->ws->last_time = ktime_get(); + rb_link_node(&wl->node, parent, node); rb_insert_color(&wl->node, &wakelocks_tree); wakelocks_lru_add(wl); @@ -233,9 +239,9 @@ int pm_wake_lock(const char *buf) u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; do_div(timeout_ms, NSEC_PER_MSEC); - __pm_wakeup_event(&wl->ws, timeout_ms); + __pm_wakeup_event(wl->ws, timeout_ms); } else { - __pm_stay_awake(&wl->ws); + __pm_stay_awake(wl->ws); } wakelocks_lru_most_recent(wl); @@ -271,7 +277,7 @@ int pm_wake_unlock(const char *buf) ret = PTR_ERR(wl); goto out; } - __pm_relax(&wl->ws); + __pm_relax(wl->ws); wakelocks_lru_most_recent(wl); wakelocks_gc(); -- cgit v1.2.3 From c8377adfa78103be5380200eb9dab764d7ca890e Mon Sep 17 00:00:00 2001 From: Tri Vo Date: Tue, 6 Aug 2019 18:48:46 -0700 Subject: PM / wakeup: Show wakeup sources stats in sysfs Add an ID and a device pointer to 'struct wakeup_source'. Use them to to expose wakeup sources statistics in sysfs under /sys/class/wakeup/wakeup/*. Co-developed-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman Co-developed-by: Stephen Boyd Signed-off-by: Stephen Boyd Signed-off-by: Tri Vo Tested-by: Kalesh Singh Signed-off-by: Rafael J. 
Wysocki --- kernel/power/autosleep.c | 2 +- kernel/power/wakelock.c | 2 +- kernel/time/alarmtimer.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index 41e83a779e19..9af5a50d3489 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -116,7 +116,7 @@ int pm_autosleep_set_state(suspend_state_t state) int __init pm_autosleep_init(void) { - autosleep_ws = wakeup_source_register("autosleep"); + autosleep_ws = wakeup_source_register(NULL, "autosleep"); if (!autosleep_ws) return -ENOMEM; diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index d1eb7fd98b64..105df4dfc783 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -188,7 +188,7 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len, return ERR_PTR(-ENOMEM); } - wl->ws = wakeup_source_register(wl->name); + wl->ws = wakeup_source_register(NULL, wl->name); if (!wl->ws) { kfree(wl->name); kfree(wl); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 57518efc3810..93b382d9701c 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -97,7 +97,7 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (!device_may_wakeup(rtc->dev.parent)) return -1; - __ws = wakeup_source_register("alarmtimer"); + __ws = wakeup_source_register(dev, "alarmtimer"); spin_lock_irqsave(&rtcdev_lock, flags); if (!rtcdev) { -- cgit v1.2.3 From c3082a674f46fe49383b157882c41dfabaa37113 Mon Sep 17 00:00:00 2001 From: Amit Kucheria Date: Thu, 11 Jul 2019 19:51:25 +0530 Subject: PM: QoS: Get rid of unused flags The network_latency and network_throughput flags for PM-QoS have not found much use in drivers or in userspace since they were introduced. Commit 4a733ef1bea7 ("mac80211: remove PM-QoS listener") removed the only user PM_QOS_NETWORK_LATENCY in the kernel a while ago and there don't seem to be any userspace tools using the character device files either. PM_QOS_MEMORY_BANDWIDTH was never even added to the trace events. Remove all the flags except cpu_dma_latency. Signed-off-by: Amit Kucheria Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- kernel/power/qos.c | 48 ------------------------------------------------ 1 file changed, 48 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 33e3febaba53..9568a2fe7c11 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -78,57 +78,9 @@ static struct pm_qos_object cpu_dma_pm_qos = { .name = "cpu_dma_latency", }; -static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); -static struct pm_qos_constraints network_lat_constraints = { - .list = PLIST_HEAD_INIT(network_lat_constraints.list), - .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, - .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, - .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN, - .notifiers = &network_lat_notifier, -}; -static struct pm_qos_object network_lat_pm_qos = { - .constraints = &network_lat_constraints, - .name = "network_latency", -}; - - -static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); -static struct pm_qos_constraints network_tput_constraints = { - .list = PLIST_HEAD_INIT(network_tput_constraints.list), - .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, - .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, - .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, - .type = PM_QOS_MAX, - .notifiers = &network_throughput_notifier, -}; -static struct pm_qos_object network_throughput_pm_qos = { - .constraints = &network_tput_constraints, - .name = "network_throughput", -}; - - -static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); -static struct pm_qos_constraints memory_bw_constraints = { - .list = PLIST_HEAD_INIT(memory_bw_constraints.list), - .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, - .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, - .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, - .type = PM_QOS_SUM, - .notifiers = &memory_bandwidth_notifier, -}; -static struct pm_qos_object memory_bandwidth_pm_qos = { - .constraints = &memory_bw_constraints, - .name = "memory_bandwidth", -}; - - static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, - &network_lat_pm_qos, - &network_throughput_pm_qos, - &memory_bandwidth_pm_qos, }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, -- cgit v1.2.3 From 3b5be16c7e90a69c93349d210766250fffcb54bd Mon Sep 17 00:00:00 2001 From: He Zhe Date: Tue, 20 Aug 2019 22:53:10 +0800 Subject: modules: page-align module section allocations only for arches supporting strict module rwx We should keep the case of "#define debug_align(X) (X)" for all arches without CONFIG_HAS_STRICT_MODULE_RWX ability, which would save people, who are sensitive to system size, a lot of memory when using modules, especially for embedded systems. This is also the intention of the original #ifdef... statement and still valid for now. Note that this still keeps the effect of the fix of the following commit, 38f054d549a8 ("modules: always page-align module section allocations"), since when CONFIG_ARCH_HAS_STRICT_MODULE_RWX is enabled, module pages are aligned. 
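As a rough illustration of the memory this saves (numbers made up, assuming a 4 KiB PAGE_SIZE):

	/* with CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y */
	size = debug_align(300);	/* ALIGN(300, PAGE_SIZE) == 4096 */

	/* without it, after this change */
	size = debug_align(300);	/* == 300, sections pack tightly again */

so a module with several small sections no longer burns a full page per section on architectures that cannot enforce strict module RWX anyway.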
Signed-off-by: He Zhe Signed-off-by: Jessica Yu --- kernel/module.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index cd8df516666d..9ee93421269c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -64,9 +64,14 @@ /* * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data + * to ensure complete separation of code and data, but + * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y */ +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX # define debug_align(X) ALIGN(X, PAGE_SIZE) +#else +# define debug_align(X) (X) +#endif /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -- cgit v1.2.3 From 49ec9177b8ec9c081850744468811e708a7c9841 Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Tue, 20 Aug 2019 13:43:49 +0530 Subject: extable: Add function to search only kernel exception table Certain architecture specific operating modes (e.g., in powerpc machine check handler that is unable to access vmalloc memory), the search_exception_tables cannot be called because it also searches the module exception tables if entry is not found in the kernel exception table. Signed-off-by: Santosh Sivaraj Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820081352.8641-5-santosh@fossix.org --- kernel/extable.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index e23cce6e6092..f6c9406eec7d 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -40,13 +40,20 @@ void __init sort_main_extable(void) } } +/* Given an address, look for it in the kernel exception table */ +const +struct exception_table_entry *search_kernel_exception_table(unsigned long addr) +{ + return search_extable(__start___ex_table, + __stop___ex_table - __start___ex_table, addr); +} + /* Given an address, look for it in the exception tables. */ const struct exception_table_entry *search_exception_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___ex_table, - __stop___ex_table - __start___ex_table, addr); + e = search_kernel_exception_table(addr); if (!e) e = search_module_extables(addr); return e; -- cgit v1.2.3 From dd2261ed45aaeddeb77768f291d604179bcab096 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 21 Aug 2019 10:24:07 +0100 Subject: hrtimer: Protect lockless access to timer->base The update to timer->base is protected by the base->cpu_base->lock(). However, hrtimer_cancel_wait_running() does access it lockless. So the compiler is allowed to refetch timer->base which can cause havoc when the timer base is changed concurrently. Use READ_ONCE() to prevent this. [ tglx: Adapted from a RT patch ] Signed-off-by: Julien Grall Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190821092409.13225-2-julien.grall@arm.com --- kernel/time/hrtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 833353732554..f48864e2ff8a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1214,7 +1214,8 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { - struct hrtimer_clock_base *base = timer->base; + /* Lockless read. 
Prevent the compiler from reloading it below */ + struct hrtimer_clock_base *base = READ_ONCE(timer->base); if (!timer->is_soft || !base || !base->cpu_base) { cpu_relax(); -- cgit v1.2.3 From 68b2c8c1e421096f4b46ac2ac502d25ca067a2a6 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 21 Aug 2019 10:24:09 +0100 Subject: hrtimer: Don't take expiry_lock when timer is currently migrated migration_base is used as a placeholder when an hrtimer is migrated to a different CPU. In the case that hrtimer_cancel_wait_running() hits a timer which is currently migrated it would pointlessly acquire the expiry lock of the migration base, which is even not initialized. Surely it could be initialized, but there is absolutely no point in acquiring this lock because the timer is guaranteed not to run it's callback for which the caller waits to finish on that base. So it would just do the inc/lock/dec/unlock dance for nothing. As the base switch is short and non-preemptible, there is no issue when the wait function returns immediately. The timer base and base->cpu_base cannot be NULL in the code path which is invoking that, so just replace those checks with a check whether base is migration base. [ tglx: Updated from RT patch. Massaged changelog. Added comment. ] Signed-off-by: Julien Grall Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190821092409.13225-4-julien.grall@arm.com --- kernel/time/hrtimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f48864e2ff8a..ebbd0fb3512b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1217,7 +1217,11 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer) /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); - if (!timer->is_soft || !base || !base->cpu_base) { + /* + * Just relax if the timer expires in hard interrupt context or if + * it is currently on the migration base. + */ + if (!timer->is_soft || base == &migration_base) cpu_relax(); return; } -- cgit v1.2.3 From 5cbd22c179012114099fe3996999b54fd2a092c5 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 21 Aug 2019 00:08:57 +0100 Subject: bpf: clarify description for CONFIG_BPF_EVENTS PERF_EVENT_IOC_SET_BPF supports uprobes since v4.3, and tracepoints since v4.7 via commit 04a22fae4cbc ("tracing, perf: Implement BPF programs attached to uprobes"), and commit 98b5c2c65c29 ("perf, bpf: allow bpf programs attach to tracepoints") respectively. Signed-off-by: Peter Wu Signed-off-by: Alexei Starovoitov --- kernel/trace/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 98da8998c25c..b09d7b1ffffd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -520,7 +520,8 @@ config BPF_EVENTS bool default y help - This allows the user to attach BPF programs to kprobe events. + This allows the user to attach BPF programs to kprobe, uprobe, and + tracepoint events. config DYNAMIC_EVENTS def_bool n -- cgit v1.2.3 From 692117c1f7a6770ed41dd8f277cd9fed1dfb16f1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 Aug 2019 16:31:46 +0200 Subject: posix-cpu-timers: Sanitize bogus WARNONS Warning when p == NULL and then proceeding and dereferencing p does not make any sense as the kernel will crash with a NULL pointer dereference right away. 
Bailing out when p == NULL and returning an error code does not cure the underlying problem which caused p to be NULL. Though it might allow to do proper debugging. Same applies to the clock id check in set_process_cpu_timer(). Clean them up and make them return without trying to do further damage. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190819143801.846497772@linutronix.de --- kernel/time/posix-cpu-timers.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 742d4a4e6f71..98223d2805c2 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -375,7 +375,8 @@ static int posix_cpu_timer_del(struct k_itimer *timer) struct sighand_struct *sighand; struct task_struct *p = timer->it.cpu.task; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return -EINVAL; /* * Protect against sighand release/switch in exit/exec and process/ @@ -581,7 +582,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, u64 old_expires, new_expires, old_incr, val; int ret; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return -EINVAL; /* * Use the to_ktime conversion because that clamps the maximum @@ -716,10 +718,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { - u64 now; struct task_struct *p = timer->it.cpu.task; + u64 now; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return; /* * Easy part: convert the reload time. @@ -1001,12 +1004,13 @@ static void check_process_timers(struct task_struct *tsk, */ static void posix_cpu_timer_rearm(struct k_itimer *timer) { + struct task_struct *p = timer->it.cpu.task; struct sighand_struct *sighand; unsigned long flags; - struct task_struct *p = timer->it.cpu.task; u64 now; - WARN_ON_ONCE(p == NULL); + if (WARN_ON_ONCE(!p)) + return; /* * Fetch the current sample and update the timer's expiry time. @@ -1203,7 +1207,9 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, u64 now; int ret; - WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); + if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) + return; + ret = cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval && ret != -EINVAL) { -- cgit v1.2.3 From dce3e8fd039cc1b62760b3ad6822cf04c262cd0e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 Aug 2019 16:31:47 +0200 Subject: posix-cpu-timers: Remove tsk argument from run_posix_cpu_timers() It's always current. Don't give people wrong ideas. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190819143801.945469967@linutronix.de --- kernel/time/posix-cpu-timers.c | 5 +++-- kernel/time/timer.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 98223d2805c2..387e0e86e1b8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1137,11 +1137,12 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. 
*/ -void run_posix_cpu_timers(struct task_struct *tsk) +void run_posix_cpu_timers(void) { - LIST_HEAD(firing); + struct task_struct *tsk = current; struct k_itimer *timer, *next; unsigned long flags; + LIST_HEAD(firing); lockdep_assert_irqs_disabled(); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 673c6a0f0c45..0e315a2e77ae 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1728,7 +1728,7 @@ void update_process_times(int user_tick) #endif scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) - run_posix_cpu_timers(p); + run_posix_cpu_timers(); } /** -- cgit v1.2.3 From b99328a60a482108f5195b4d611f90992ca016ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 Aug 2019 13:00:15 +0200 Subject: timekeeping/vsyscall: Prevent math overflow in BOOTTIME update The VDSO update for CLOCK_BOOTTIME has a overflow issue as it shifts the nanoseconds based boot time offset left by the clocksource shift. That overflows once the boot time offset becomes large enough. As a consequence CLOCK_BOOTTIME in the VDSO becomes a random number causing applications to misbehave. Fix it by storing a timespec64 representation of the offset when boot time is adjusted and add that to the MONOTONIC base time value in the vdso data page. Using the timespec64 representation avoids a 64bit division in the update code. Fixes: 44f57d788e7d ("timekeeping: Provide a generic update_vsyscall() implementation") Reported-by: Chris Clayton Signed-off-by: Thomas Gleixner Tested-by: Chris Clayton Tested-by: Vincenzo Frascino Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1908221257580.1983@nanos.tec.linutronix.de --- kernel/time/timekeeping.c | 5 +++++ kernel/time/vsyscall.c | 22 +++++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d911c8470149..ca69290bee2a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -146,6 +146,11 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* + * Timespec representation for VDSO update to avoid 64bit division + * on every update. 
+ */ + tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } /* diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 8cf3596a4ce6..4bc37ac3bb05 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -17,7 +17,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { struct vdso_timestamp *vdso_ts; - u64 nsec; + u64 nsec, sec; vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; @@ -45,23 +45,27 @@ static inline void update_vdso_data(struct vdso_data *vdata, } vdso_ts->nsec = nsec; - /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; - vdso_ts->sec = tk->raw_sec; - vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + /* Copy MONOTONIC time for BOOTTIME */ + sec = vdso_ts->sec; + /* Add the boot offset */ + sec += tk->monotonic_to_boot.tv_sec; + nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; - vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec; - nsec += ((u64)(tk->wall_to_monotonic.tv_nsec + - ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift); + vdso_ts->sec = sec; + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; + /* CLOCK_MONOTONIC_RAW */ + vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts->sec = tk->raw_sec; + vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + /* CLOCK_TAI */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; -- cgit v1.2.3 From 6754172c208d9d3dae208c6494611ac167d56688 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 21 Aug 2019 14:07:10 -0700 Subject: bpf: fix precision tracking in presence of bpf2bpf calls While adding extra tests for precision tracking and extra infra to adjust verifier heuristics the existing test "calls: cross frame pruning - liveness propagation" started to fail. The root cause is the same as described in verifer.c comment: * Also if parent's curframe > frame where backtracking started, * the verifier need to mark registers in both frames, otherwise callees * may incorrectly prune callers. This is similar to * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") * For now backtracking falls back into conservative marking. Turned out though that returning -ENOTSUPP from backtrack_insn() and doing mark_all_scalars_precise() in the current parentage chain is not enough. Depending on how is_state_visited() heuristic is creating parentage chain it's possible that callee will incorrectly prune caller. Fix the issue by setting precise=true earlier and more aggressively. Before this fix the precision tracking _within_ functions that don't do bpf2bpf calls would still work. Whereas now precision tracking is completely disabled when bpf2bpf calls are present anywhere in the program. No difference in cilium tests (they don't have bpf2bpf calls). No difference in test_progs though some of them have bpf2bpf calls, but precision tracking wasn't effective there. 
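For readers less familiar with the term: a bpf2bpf call is simply a call from one BPF function to another within the same program object. A minimal illustrative example (clang/libbpf style, names invented for the sketch):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	/* compiled as a separate BPF function and reached via a
	 * BPF-to-BPF call instruction, i.e. a "bpf2bpf call"
	 */
	static __attribute__((noinline)) int clamp_len(int len)
	{
		return len > 64 ? 64 : len;
	}

	SEC("xdp")
	int xdp_prog(struct xdp_md *ctx)
	{
		int len = ctx->data_end - ctx->data;

		return clamp_len(len) == 64 ? XDP_DROP : XDP_PASS;
	}

With this fix the verifier marks the scalars of such programs precise up front, i.e. it falls back to conservative marking whenever bpf2bpf calls are present, instead of risking incorrect pruning across frames.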
Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c84d83f86141..b5c14c9d7b98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -985,9 +985,6 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; - - /* constant backtracking is enabled for root only for now */ - reg->precise = capable(CAP_SYS_ADMIN) ? false : true; } /* Mark a register as having a completely unknown (scalar) value. */ @@ -1014,7 +1011,11 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, __mark_reg_not_init(regs + regno); return; } - __mark_reg_unknown(regs + regno); + regs += regno; + __mark_reg_unknown(regs); + /* constant backtracking is enabled for root without bpf2bpf calls */ + regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? + true : false; } static void __mark_reg_not_init(struct bpf_reg_state *reg) -- cgit v1.2.3 From c751798aa224fadc5124b49eeb38fb468c0fa039 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Aug 2019 22:14:23 +0200 Subject: bpf: fix use after free in prog symbol exposure syzkaller managed to trigger the warning in bpf_jit_free() which checks via bpf_prog_kallsyms_verify_off() for potentially unlinked JITed BPF progs in kallsyms, and subsequently trips over GPF when walking kallsyms entries: [...] 8021q: adding VLAN 0 to HW filter on device batadv0 8021q: adding VLAN 0 to HW filter on device batadv0 WARNING: CPU: 0 PID: 9869 at kernel/bpf/core.c:810 bpf_jit_free+0x1e8/0x2a0 Kernel panic - not syncing: panic_on_warn set ... 
CPU: 0 PID: 9869 Comm: kworker/0:7 Not tainted 5.0.0-rc8+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events bpf_prog_free_deferred Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x113/0x167 lib/dump_stack.c:113 panic+0x212/0x40b kernel/panic.c:214 __warn.cold.8+0x1b/0x38 kernel/panic.c:571 report_bug+0x1a4/0x200 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 do_invalid_op+0x36/0x40 arch/x86/kernel/traps.c:290 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973 RIP: 0010:bpf_jit_free+0x1e8/0x2a0 Code: 02 4c 89 e2 83 e2 07 38 d0 7f 08 84 c0 0f 85 86 00 00 00 48 ba 00 02 00 00 00 00 ad de 0f b6 43 02 49 39 d6 0f 84 5f fe ff ff <0f> 0b e9 58 fe ff ff 48 b8 00 00 00 00 00 fc ff df 4c 89 e2 48 c1 RSP: 0018:ffff888092f67cd8 EFLAGS: 00010202 RAX: 0000000000000007 RBX: ffffc90001947000 RCX: ffffffff816e9d88 RDX: dead000000000200 RSI: 0000000000000008 RDI: ffff88808769f7f0 RBP: ffff888092f67d00 R08: fffffbfff1394059 R09: fffffbfff1394058 R10: fffffbfff1394058 R11: ffffffff89ca02c7 R12: ffffc90001947002 R13: ffffc90001947020 R14: ffffffff881eca80 R15: ffff88808769f7e8 BUG: unable to handle kernel paging request at fffffbfff400d000 #PF error: [normal kernel read fault] PGD 21ffee067 P4D 21ffee067 PUD 21ffed067 PMD 9f942067 PTE 0 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 9869 Comm: kworker/0:7 Not tainted 5.0.0-rc8+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events bpf_prog_free_deferred RIP: 0010:bpf_get_prog_addr_region kernel/bpf/core.c:495 [inline] RIP: 0010:bpf_tree_comp kernel/bpf/core.c:558 [inline] RIP: 0010:__lt_find include/linux/rbtree_latch.h:115 [inline] RIP: 0010:latch_tree_find include/linux/rbtree_latch.h:208 [inline] RIP: 0010:bpf_prog_kallsyms_find+0x107/0x2e0 kernel/bpf/core.c:632 Code: 00 f0 ff ff 44 38 c8 7f 08 84 c0 0f 85 fa 00 00 00 41 f6 45 02 01 75 02 0f 0b 48 39 da 0f 82 92 00 00 00 48 89 d8 48 c1 e8 03 <42> 0f b6 04 30 84 c0 74 08 3c 03 0f 8e 45 01 00 00 8b 03 48 c1 e0 [...] Upon further debugging, it turns out that whenever we trigger this issue, the kallsyms removal in bpf_prog_ksym_node_del() was /skipped/ but yet bpf_jit_free() reported that the entry is /in use/. Problem is that symbol exposure via bpf_prog_kallsyms_add() but also perf_event_bpf_event() were done /after/ bpf_prog_new_fd(). Once the fd is exposed to the public, a parallel close request came in right before we attempted to do the bpf_prog_kallsyms_add(). Given at this time the prog reference count is one, we start to rip everything underneath us via bpf_prog_release() -> bpf_prog_put(). The memory is eventually released via deferred free, so we're seeing that bpf_jit_free() has a kallsym entry because we added it from bpf_prog_load() but /after/ bpf_prog_put() from the remote CPU. Therefore, move both notifications /before/ we install the fd. The issue was never seen between bpf_prog_alloc_id() and bpf_prog_new_fd() because upon bpf_prog_get_fd_by_id() we'll take another reference to the BPF prog, so we're still holding the original reference from the bpf_prog_load(). 
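Put differently, the problematic interleaving looks roughly like this (illustrative timeline, not actual code):

	/*
	 *   bpf_prog_load() on CPU A              parallel task on CPU B
	 *   -------------------------             ----------------------
	 *   fd = bpf_prog_new_fd(prog);
	 *                                          close(fd);
	 *                                            bpf_prog_put(prog);
	 *                                              // last ref gone, deferred
	 *                                              // free of prog scheduled
	 *   bpf_prog_kallsyms_add(prog);
	 *   perf_event_bpf_event(prog, ...);
	 *     // both run on a prog that is
	 *     // already being torn down
	 */

Moving the two notifications before bpf_prog_new_fd() closes this window, because until the fd exists user space has no way to drop the load-time reference.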
Fixes: 6ee52e2a3fe4 ("perf, bpf: Introduce PERF_RECORD_BPF_EVENT") Fixes: 74451e66d516 ("bpf: make jited programs visible in traces") Reported-by: syzbot+bd3bba6ff3fcea7a6ec6@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Cc: Song Liu --- kernel/bpf/syscall.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5d141f16f6fa..272071e9112f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1707,20 +1707,26 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (err) goto free_used_maps; - err = bpf_prog_new_fd(prog); - if (err < 0) { - /* failed to allocate fd. - * bpf_prog_put() is needed because the above - * bpf_prog_alloc_id() has published the prog - * to the userspace and the userspace may - * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. - */ - bpf_prog_put(prog); - return err; - } - + /* Upon success of bpf_prog_alloc_id(), the BPF prog is + * effectively publicly exposed. However, retrieving via + * bpf_prog_get_fd_by_id() will take another reference, + * therefore it cannot be gone underneath us. + * + * Only for the time /after/ successful bpf_prog_new_fd() + * and before returning to userspace, we might just hold + * one reference and any parallel close on that fd could + * rip everything out. Hence, below notifications must + * happen before bpf_prog_new_fd(). + * + * Also, any failure handling from this point onwards must + * be using bpf_prog_put() given the program is exposed. + */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + + err = bpf_prog_new_fd(prog); + if (err < 0) + bpf_prog_put(prog); return err; free_used_maps: -- cgit v1.2.3 From 7b2b55da1db10a5525460633ae4b6fb0be060c41 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 24 Aug 2019 17:54:53 -0700 Subject: psi: get poll_work to run when calling poll syscall next time Only when calling the poll syscall the first time can user receive POLLPRI correctly. After that, user always fails to acquire the event signal. Reproduce case: 1. Get the monitor code in Documentation/accounting/psi.txt 2. Run it, and wait for the event triggered. 3. Kill and restart the process. The question is why we can end up with poll_scheduled = 1 but the work not running (which would reset it to 0). And the answer is because the scheduling side sees group->poll_kworker under RCU protection and then schedules it, but here we cancel the work and destroy the worker. The cancel needs to pair with resetting the poll_scheduled flag. Link: http://lkml.kernel.org/r/1566357985-97781-1-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Jason Xing Signed-off-by: Joseph Qi Reviewed-by: Caspar Zhang Reviewed-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 23fbbcc414d5..6e52b67b420e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref) * deadlock while waiting for psi_poll_work to acquire trigger_lock */ if (kworker_to_destroy) { + /* + * After the RCU grace period has expired, the worker + * can no longer be found through group->poll_kworker. 
+ * But it might have been already scheduled before + * that - deschedule it cleanly before destroying it. + */ kthread_cancel_delayed_work_sync(&group->poll_work); + atomic_set(&group->poll_scheduled, 0); + kthread_destroy_worker(kworker_to_destroy); } kfree(t); -- cgit v1.2.3 From ede7c460b1da5be7b8ef4efe47f1687babf06408 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 22 Aug 2019 00:53:58 +0530 Subject: bpf: handle 32-bit zext during constant blinding Since BPF constant blinding is performed after the verifier pass, the ALU32 instructions inserted for doubleword immediate loads don't have a corresponding zext instruction. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8191a7db2777..66088a9e9b9e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, - struct bpf_insn *to_buff) + struct bpf_insn *to_buff, + bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_int(); @@ -1005,6 +1006,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + if (emit_zext) + *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; @@ -1088,7 +1091,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); - rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, + clone->aux->verifier_zext); if (!rewritten) continue; -- cgit v1.2.3 From 53c1788b7d7720565214a466afffdc818d8c6e5f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Aug 2019 10:28:48 +0800 Subject: genirq/affinity: Improve __irq_build_affinity_masks() One invariant of __irq_build_affinity_masks() is that all CPUs in the specified masks (cpu_mask AND node_to_cpumask for each node) should be covered during the spread. Even though all requested vectors have been reached, it's still required to spread vectors among remained CPUs. A similar policy has been taken in case of 'numvecs <= nodes' already. So remove the following check inside the loop: if (done >= numvecs) break; Meantime assign at least 1 vector for remaining nodes if 'numvecs' vectors have been handled already. Also, if the specified cpumask for one numa node is empty, simply do not spread vectors on this node. 
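The effect of the new policy can be sketched in plain C outside the kernel (the per-node CPU counts below are made-up example data): empty nodes are skipped, and once the requested vector count has been reached, every remaining non-empty node still gets one vector so that all CPUs end up covered. In the kernel those extra assignments wrap onto already-used vectors because curvec wraps back to firstvec.

/* Standalone sketch of the per-node calculation; not kernel code. */
#include <stdio.h>

int main(void)
{
	/* hypothetical CPU count per node after AND-ing with the cpu mask */
	unsigned int ncpus[] = { 8, 0, 4, 4 };
	unsigned int nodes = 4, numvecs = 2, done = 0;

	for (unsigned int n = 0; n < 4; n++) {
		unsigned int vecs_per_node, vecs_to_assign;

		if (!ncpus[n])
			continue;		/* empty node: nothing to spread */

		if (numvecs > done)		/* vectors still left to hand out */
			vecs_per_node = (numvecs - done) / nodes;
		else				/* all handed out: one per node */
			vecs_per_node = 1;
		if (!vecs_per_node)
			vecs_per_node = 1;

		vecs_to_assign = vecs_per_node < ncpus[n] ? vecs_per_node : ncpus[n];
		printf("node %u gets %u vector(s)\n", n, vecs_to_assign);

		done += vecs_to_assign;
		--nodes;
	}
	/*
	 * At most numvecs (here 2) is reported, although node 3 was still
	 * spread onto a (wrapped) vector instead of being left uncovered.
	 */
	printf("reported: %u\n", done < numvecs ? done : numvecs);
	return 0;
}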
Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190816022849.14075-2-ming.lei@redhat.com --- kernel/irq/affinity.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 6fef48033f96..c7cca942bd8a 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -129,14 +129,26 @@ static int __irq_build_affinity_masks(unsigned int startvec, for_each_node_mask(n, nodemsk) { unsigned int ncpus, v, vecs_to_assign, vecs_per_node; - /* Spread the vectors per node */ - vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; - /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - - /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); + if (!ncpus) + continue; + + /* + * Calculate the number of cpus per vector + * + * Spread the vectors evenly per node. If the requested + * vector number has been reached, simply allocate one + * vector for each remaining node so that all nodes can + * be covered + */ + if (numvecs > done) + vecs_per_node = max_t(unsigned, + (numvecs - done) / nodes, 1); + else + vecs_per_node = 1; + vecs_to_assign = min(vecs_per_node, ncpus); /* Account for rounding errors */ @@ -156,13 +168,11 @@ static int __irq_build_affinity_masks(unsigned int startvec, } done += v; - if (done >= numvecs) - break; if (curvec >= last_affv) curvec = firstvec; --nodes; } - return done; + return done < numvecs ? done : numvecs; } /* -- cgit v1.2.3 From b1a5a73e64e99faa5f4deef2ae96d7371a0fb5d0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Aug 2019 10:28:49 +0800 Subject: genirq/affinity: Spread vectors on node according to nr_cpu ratio Now __irq_build_affinity_masks() spreads vectors evenly per node, but there is a case that not all vectors have been spread when each numa node has a different number of CPUs which triggers the warning in the spreading code. Improve the spreading algorithm by - assigning vectors according to the ratio of the number of CPUs on a node to the number of remaining CPUs. - running the assignment from smaller nodes to bigger nodes to guarantee that every active node gets allocated at least one vector. This ensures that all vectors are spread out. Asided of that the spread becomes more fair if the nodes have different number of CPUs. For example, on the following machine: CPU(s): 16 On-line CPU(s) list: 0-15 Thread(s) per core: 1 Core(s) per socket: 8 Socket(s): 2 NUMA node(s): 2 ... NUMA node0 CPU(s): 0,1,3,5-9,11,13-15 NUMA node1 CPU(s): 2,4,10,12 When a driver requests to allocate 8 vectors, the following spread results: irq 31, cpu list 2,4 irq 32, cpu list 10,12 irq 33, cpu list 0-1 irq 34, cpu list 3,5 irq 35, cpu list 6-7 irq 36, cpu list 8-9 irq 37, cpu list 11,13 irq 38, cpu list 14-15 So Node 0 has now 6 and Node 1 has 2 vectors assigned. The original algorithm assigned 4 vectors on each node which was unfair versus Node 0. 
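A runnable sketch of that split (plain C, not kernel code), using the machine from the changelog - node0 with 12 CPUs, node1 with 4 CPUs and 8 requested vectors - reproduces the 6/2 result; visiting the smaller node first is what guarantees at least one vector per node:

#include <stdio.h>
#include <stdlib.h>

struct node_vec { unsigned int id, ncpus, nvectors; };

static int ncpus_cmp(const void *l, const void *r)
{
	const struct node_vec *ln = l, *rn = r;

	return (int)ln->ncpus - (int)rn->ncpus;	/* smaller nodes first */
}

int main(void)
{
	struct node_vec nodes[] = { { 0, 12, 0 }, { 1, 4, 0 } };
	unsigned int numvecs = 8, remaining_ncpus = 12 + 4;

	qsort(nodes, 2, sizeof(nodes[0]), ncpus_cmp);

	for (unsigned int i = 0; i < 2; i++) {
		unsigned int ncpus = nodes[i].ncpus;
		unsigned int nv = numvecs * ncpus / remaining_ncpus;

		nodes[i].nvectors = nv ? nv : 1;	/* at least one vector */
		remaining_ncpus -= ncpus;
		numvecs -= nodes[i].nvectors;
		printf("node %u: %u vectors\n", nodes[i].id, nodes[i].nvectors);
	}
	return 0;	/* prints: node 1: 2 vectors, node 0: 6 vectors */
}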
[ tglx: Massaged changelog ] Reported-by: Jon Derrick Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Keith Busch Reviewed-by: Jon Derrick Link: https://lkml.kernel.org/r/20190816022849.14075-3-ming.lei@redhat.com --- kernel/irq/affinity.c | 239 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 200 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index c7cca942bd8a..d905e844bf3a 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -7,6 +7,7 @@ #include #include #include +#include static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, unsigned int cpus_per_vec) @@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } +struct node_vectors { + unsigned id; + + union { + unsigned nvectors; + unsigned ncpus; + }; +}; + +static int ncpus_cmp_func(const void *l, const void *r) +{ + const struct node_vectors *ln = l; + const struct node_vectors *rn = r; + + return ln->ncpus - rn->ncpus; +} + +/* + * Allocate vector number for each node, so that for each node: + * + * 1) the allocated number is >= 1 + * + * 2) the allocated numbver is <= active CPU number of this node + * + * The actual allocated total vectors may be less than @numvecs when + * active total CPU number is less than @numvecs. + * + * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' + * for each node. + */ +static void alloc_nodes_vectors(unsigned int numvecs, + const cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + const nodemask_t nodemsk, + struct cpumask *nmsk, + struct node_vectors *node_vectors) +{ + unsigned n, remaining_ncpus = 0; + + for (n = 0; n < nr_node_ids; n++) { + node_vectors[n].id = n; + node_vectors[n].ncpus = UINT_MAX; + } + + for_each_node_mask(n, nodemsk) { + unsigned ncpus; + + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + ncpus = cpumask_weight(nmsk); + + if (!ncpus) + continue; + remaining_ncpus += ncpus; + node_vectors[n].ncpus = ncpus; + } + + numvecs = min_t(unsigned, remaining_ncpus, numvecs); + + sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]), + ncpus_cmp_func, NULL); + + /* + * Allocate vectors for each node according to the ratio of this + * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is + * bigger than number of active numa nodes. Always start the + * allocation from the node with minimized nr_cpus. + * + * This way guarantees that each active node gets allocated at + * least one vector, and the theory is simple: over-allocation + * is only done when this node is assigned by one vector, so + * other nodes will be allocated >= 1 vector, since 'numvecs' is + * bigger than number of numa nodes. 
+ * + * One perfect invariant is that number of allocated vectors for + * each node is <= CPU count of this node: + * + * 1) suppose there are two nodes: A and B + * ncpu(X) is CPU count of node X + * vecs(X) is the vector count allocated to node X via this + * algorithm + * + * ncpu(A) <= ncpu(B) + * ncpu(A) + ncpu(B) = N + * vecs(A) + vecs(B) = V + * + * vecs(A) = max(1, round_down(V * ncpu(A) / N)) + * vecs(B) = V - vecs(A) + * + * both N and V are integer, and 2 <= V <= N, suppose + * V = N - delta, and 0 <= delta <= N - 2 + * + * 2) obviously vecs(A) <= ncpu(A) because: + * + * if vecs(A) is 1, then vecs(A) <= ncpu(A) given + * ncpu(A) >= 1 + * + * otherwise, + * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N + * + * 3) prove how vecs(B) <= ncpu(B): + * + * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be + * over-allocated, so vecs(B) <= ncpu(B), + * + * otherwise: + * + * vecs(A) = + * round_down(V * ncpu(A) / N) = + * round_down((N - delta) * ncpu(A) / N) = + * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= + * round_down((N * ncpu(A) - delta * N) / N) = + * cpu(A) - delta + * + * then: + * + * vecs(A) - V >= ncpu(A) - delta - V + * => + * V - vecs(A) <= V + delta - ncpu(A) + * => + * vecs(B) <= N - ncpu(A) + * => + * vecs(B) <= cpu(B) + * + * For nodes >= 3, it can be thought as one node and another big + * node given that is exactly what this algorithm is implemented, + * and we always re-calculate 'remaining_ncpus' & 'numvecs', and + * finally for each node X: vecs(X) <= ncpu(X). + * + */ + for (n = 0; n < nr_node_ids; n++) { + unsigned nvectors, ncpus; + + if (node_vectors[n].ncpus == UINT_MAX) + continue; + + WARN_ON_ONCE(numvecs == 0); + + ncpus = node_vectors[n].ncpus; + nvectors = max_t(unsigned, 1, + numvecs * ncpus / remaining_ncpus); + WARN_ON_ONCE(nvectors > ncpus); + + node_vectors[n].nvectors = nvectors; + + remaining_ncpus -= ncpus; + numvecs -= nvectors; + } +} + static int __irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, unsigned int firstvec, @@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec, struct cpumask *nmsk, struct irq_affinity_desc *masks) { - unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0; + unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; unsigned int last_affv = firstvec + numvecs; unsigned int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; + struct node_vectors *node_vectors; if (!cpumask_weight(cpu_mask)) return 0; @@ -126,53 +277,57 @@ static int __irq_build_affinity_masks(unsigned int startvec, return numvecs; } - for_each_node_mask(n, nodemsk) { - unsigned int ncpus, v, vecs_to_assign, vecs_per_node; + node_vectors = kcalloc(nr_node_ids, + sizeof(struct node_vectors), + GFP_KERNEL); + if (!node_vectors) + return -ENOMEM; + + /* allocate vector number for each node */ + alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, + nodemsk, nmsk, node_vectors); + + for (i = 0; i < nr_node_ids; i++) { + unsigned int ncpus, v; + struct node_vectors *nv = &node_vectors[i]; + + if (nv->nvectors == UINT_MAX) + continue; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); ncpus = cpumask_weight(nmsk); if (!ncpus) continue; - /* - * Calculate the number of cpus per vector - * - * Spread the vectors evenly per node. 
If the requested - * vector number has been reached, simply allocate one - * vector for each remaining node so that all nodes can - * be covered - */ - if (numvecs > done) - vecs_per_node = max_t(unsigned, - (numvecs - done) / nodes, 1); - else - vecs_per_node = 1; - - vecs_to_assign = min(vecs_per_node, ncpus); + WARN_ON_ONCE(nv->nvectors > ncpus); /* Account for rounding errors */ - extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign); + extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors); - for (v = 0; curvec < last_affv && v < vecs_to_assign; - curvec++, v++) { - cpus_per_vec = ncpus / vecs_to_assign; + /* Spread allocated vectors on CPUs of the current node */ + for (v = 0; v < nv->nvectors; v++, curvec++) { + cpus_per_vec = ncpus / nv->nvectors; /* Account for extra vectors to compensate rounding errors */ if (extra_vecs) { cpus_per_vec++; --extra_vecs; } + + /* + * wrapping has to be considered given 'startvec' + * may start anywhere + */ + if (curvec >= last_affv) + curvec = firstvec; irq_spread_init_one(&masks[curvec].mask, nmsk, cpus_per_vec); } - - done += v; - if (curvec >= last_affv) - curvec = firstvec; - --nodes; + done += nv->nvectors; } - return done < numvecs ? done : numvecs; + kfree(node_vectors); + return done; } /* @@ -184,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, unsigned int firstvec, struct irq_affinity_desc *masks) { - unsigned int curvec = startvec, nr_present, nr_others; + unsigned int curvec = startvec, nr_present = 0, nr_others = 0; cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; @@ -199,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, if (!node_to_cpumask) goto fail_npresmsk; - ret = 0; /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ - nr_present = __irq_build_affinity_masks(curvec, numvecs, - firstvec, node_to_cpumask, - cpu_present_mask, nmsk, masks); + ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, + node_to_cpumask, cpu_present_mask, + nmsk, masks); + if (ret < 0) + goto fail_build_affinity; + nr_present = ret; /* * Spread on non present CPUs starting from the next vector to be @@ -220,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, else curvec = firstvec + nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - nr_others = __irq_build_affinity_masks(curvec, numvecs, - firstvec, node_to_cpumask, - npresmsk, nmsk, masks); + ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, + node_to_cpumask, npresmsk, nmsk, + masks); + if (ret >= 0) + nr_others = ret; + + fail_build_affinity: put_online_cpus(); - if (nr_present < numvecs) + if (ret >= 0) WARN_ON(nr_present + nr_others < numvecs); free_node_to_cpumask(node_to_cpumask); @@ -235,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, fail_nmsk: free_cpumask_var(nmsk); - return ret; + return ret < 0 ? 
ret : 0; } static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) -- cgit v1.2.3 From 2a1a3fa0f29270583f0e6e3100d609e09697add1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 24 Aug 2019 14:12:31 +0100 Subject: kallsyms: Don't let kallsyms_lookup_size_offset() fail on retrieving the first symbol An arm64 kernel configured with CONFIG_KPROBES=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set CONFIG_KALLSYMS_BASE_RELATIVE=y reports the following kprobe failure: [ 0.032677] kprobes: failed to populate blacklist: -22 [ 0.033376] Please take care of using kprobes. It appears that kprobe fails to retrieve the symbol at address 0xffff000010081000, despite this symbol being in System.map: ffff000010081000 T __exception_text_start This symbol is part of the first group of aliases in the kallsyms_offsets array (symbol names generated using ugly hacks in scripts/kallsyms.c): kallsyms_offsets: .long 0x1000 // do_undefinstr .long 0x1000 // efi_header_end .long 0x1000 // _stext .long 0x1000 // __exception_text_start .long 0x12b0 // do_cp15instr Looking at the implementation of get_symbol_pos(), it returns the lowest index for aliasing symbols. In this case, it return 0. But kallsyms_lookup_size_offset() considers 0 as a failure, which is obviously wrong (there is definitely a valid symbol living there). In turn, the kprobe blacklisting stops abruptly, hence the original error. A CONFIG_KALLSYMS_ALL kernel wouldn't fail as there is always some random symbols at the beginning of this array, which are never looked up via kallsyms_lookup_size_offset. Fix it by considering that get_symbol_pos() is always successful (which is consistent with the other uses of this function). Fixes: ffc5089196446 ("[PATCH] Create kallsyms_lookup_size_offset()") Reviewed-by: Masami Hiramatsu Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- kernel/kallsyms.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 95a260f9214b..136ce049c4ad 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -263,8 +263,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, { char namebuf[KSYM_NAME_LEN]; - if (is_ksym_addr(addr)) - return !!get_symbol_pos(addr, symbolsize, offset); + if (is_ksym_addr(addr)) { + get_symbol_pos(addr, symbolsize, offset); + return 1; + } return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } -- cgit v1.2.3 From 10d274e880eb208ec6a76261a9f8f8155020f771 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 22 Aug 2019 22:52:12 -0700 Subject: bpf: introduce verifier internal test flag Introduce BPF_F_TEST_STATE_FREQ flag to stress test parentage chain and state pruning. 
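As a usage illustration (not part of the patch), a test can request this behaviour by setting the flag in prog_flags when loading a program. A minimal raw-syscall sketch for a trivial "r0 = 0; exit" socket filter follows; it assumes uapi headers recent enough to define BPF_F_TEST_STATE_FREQ and a 64-bit build, and the flag only takes effect for privileged loads:

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0 },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	union bpf_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type	= BPF_PROG_TYPE_SOCKET_FILTER;
	attr.insn_cnt	= 2;
	attr.insns	= (unsigned long)insns;
	attr.license	= (unsigned long)"GPL";
	attr.prog_flags	= BPF_F_TEST_STATE_FREQ;	/* stress state pruning */

	fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	if (fd < 0)
		perror("BPF_PROG_LOAD");
	else
		close(fd);
	return 0;
}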
Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 1 + kernel/bpf/verifier.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c0f62fd67c6b..ca60eafa6922 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1629,6 +1629,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_STATE_FREQ | BPF_F_TEST_RND_HI32)) return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 10c0ff93f52b..f340cfe53c6e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7222,7 +7222,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - bool add_new_state = false; + bool add_new_state = env->test_state_freq ? true : false; cur->last_insn_idx = env->prev_insn_idx; if (!env->insn_aux_data[insn_idx].prune_point) @@ -9262,6 +9262,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->allow_ptr_leaks = is_priv; + if (is_priv) + env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From 77c84dd1881d0f0176cb678d770bfbda26c54390 Mon Sep 17 00:00:00 2001 From: Douglas RAILLARD Date: Wed, 7 Aug 2019 16:33:40 +0100 Subject: sched/cpufreq: Align trace event behavior of fast switching Fast switching path only emits an event for the CPU of interest, whereas the regular path emits an event for all the CPUs that had their frequency changed, i.e. all the CPUs sharing the same policy. With the current behavior, looking at cpu_frequency event for a given CPU that is using the fast switching path will not give the correct frequency signal. Signed-off-by: Douglas RAILLARD Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 867b4bb6d4be..b03ca2f73713 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -117,6 +117,7 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { struct cpufreq_policy *policy = sg_policy->policy; + int cpu; if (!sugov_update_next_freq(sg_policy, time, next_freq)) return; @@ -126,7 +127,11 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, return; policy->cur = next_freq; - trace_cpu_frequency(next_freq, smp_processor_id()); + + if (trace_cpu_frequency_enabled()) { + for_each_cpu(cpu, policy->cpus) + trace_cpu_frequency(next_freq, cpu); + } } static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, -- cgit v1.2.3 From ab43762ef010967e4ccd53627f70a2eecbeafefb Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Aug 2019 11:46:00 +0300 Subject: perf: Allow normal events to output AUX data In some cases, ordinary (non-AUX) events can generate data for AUX events. For example, PEBS events can come out as records in the Intel PT stream instead of their usual DS records, if configured to do so. 
One requirement for such events is to consistently schedule together, to ensure that the data from the "AUX output" events isn't lost while their corresponding AUX event is not scheduled. We use grouping to provide this guarantee: an "AUX output" event can be added to a group where an AUX event is a group leader, and provided that the former supports writing to the latter. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: kan.liang@linux.intel.com Link: https://lkml.kernel.org/r/20190806084606.4021-2-alexander.shishkin@linux.intel.com --- kernel/events/core.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0463c1151bae..2aad959e6def 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1887,6 +1887,89 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) ctx->generation++; } +static int +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) +{ + if (!has_aux(aux_event)) + return 0; + + if (!event->pmu->aux_output_match) + return 0; + + return event->pmu->aux_output_match(aux_event); +} + +static void put_event(struct perf_event *event); +static void event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx); + +static void perf_put_aux_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *iter; + + /* + * If event uses aux_event tear down the link + */ + if (event->aux_event) { + iter = event->aux_event; + event->aux_event = NULL; + put_event(iter); + return; + } + + /* + * If the event is an aux_event, tear down all links to + * it from other events. + */ + for_each_sibling_event(iter, event->group_leader) { + if (iter->aux_event != event) + continue; + + iter->aux_event = NULL; + put_event(event); + + /* + * If it's ACTIVE, schedule it out and put it into ERROR + * state so that we don't try to schedule it again. Note + * that perf_event_enable() will clear the ERROR status. + */ + event_sched_out(iter, cpuctx, ctx); + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + } +} + +static int perf_get_aux_event(struct perf_event *event, + struct perf_event *group_leader) +{ + /* + * Our group leader must be an aux event if we want to be + * an aux_output. This way, the aux event will precede its + * aux_output events in the group, and therefore will always + * schedule first. + */ + if (!group_leader) + return 0; + + if (!perf_aux_output_match(event, group_leader)) + return 0; + + if (!atomic_long_inc_not_zero(&group_leader->refcount)) + return 0; + + /* + * Link aux_outputs to their aux event; this is undone in + * perf_group_detach() by perf_put_aux_event(). When the + * group in torn down, the aux_output events loose their + * link to the aux_event and can't schedule any more. + */ + event->aux_event = group_leader; + + return 1; +} + static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; @@ -1902,6 +1985,8 @@ static void perf_group_detach(struct perf_event *event) event->attach_state &= ~PERF_ATTACH_GROUP; + perf_put_aux_event(event); + /* * If this is a sibling, remove it from its group. 
*/ @@ -10426,6 +10511,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + if (event->attr.aux_output && + !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { + err = -EOPNOTSUPP; + goto err_pmu; + } + err = exclusive_event_init(event); if (err) goto err_pmu; @@ -11082,6 +11173,8 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (event->attr.aux_output && !perf_get_aux_event(event, group_leader)) + goto err_locked; /* * Must be under the same ctx::mutex as perf_install_in_context(), -- cgit v1.2.3 From 6ae40e3fdcd33a6ff3c490b9302d6a1861093f65 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:48 +0200 Subject: posix-cpu-timers: Provide task validation functions The code contains three slightly different copies of validating whether a given clock resolves to a valid task and whether the current caller has permissions to access it. Create central functions. Replace check_clock() as a first step and rename it to something sensible. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190821192919.326097175@linutronix.de --- kernel/time/posix-cpu-timers.c | 65 ++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 387e0e86e1b8..b06ed8b14861 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -35,27 +35,52 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) spin_unlock_irq(&task->sighand->siglock); } -static int check_clock(const clockid_t which_clock) +/* + * Functions for validating access to tasks. + */ +static struct task_struct *lookup_task(const pid_t pid, bool thread) { - int error = 0; struct task_struct *p; - const pid_t pid = CPUCLOCK_PID(which_clock); - if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) - return -EINVAL; + if (!pid) + return thread ? current : current->group_leader; + + p = find_task_by_vpid(pid); + if (!p || p == current) + return p; + if (thread) + return same_thread_group(p, current) ? p : NULL; + if (p == current) + return p; + return has_group_leader_pid(p) ? p : NULL; +} + +static struct task_struct *__get_task_for_clock(const clockid_t clock, + bool getref) +{ + const bool thread = !!CPUCLOCK_PERTHREAD(clock); + const pid_t pid = CPUCLOCK_PID(clock); + struct task_struct *p; - if (pid == 0) - return 0; + if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX) + return NULL; rcu_read_lock(); - p = find_task_by_vpid(pid); - if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? - same_thread_group(p, current) : has_group_leader_pid(p))) { - error = -EINVAL; - } + p = lookup_task(pid, thread); + if (p && getref) + get_task_struct(p); rcu_read_unlock(); + return p; +} - return error; +static inline struct task_struct *get_task_for_clock(const clockid_t clock) +{ + return __get_task_for_clock(clock, true); +} + +static inline int validate_clock_permissions(const clockid_t clock) +{ + return __get_task_for_clock(clock, false) ? 
0 : -EINVAL; } /* @@ -125,7 +150,8 @@ static inline u64 virt_ticks(struct task_struct *p) static int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { - int error = check_clock(which_clock); + int error = validate_clock_permissions(which_clock); + if (!error) { tp->tv_sec = 0; tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); @@ -142,20 +168,17 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) } static int -posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) +posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) { + int error = validate_clock_permissions(clock); + /* * You can never reset a CPU clock, but we check for other errors * in the call before failing with EPERM. */ - int error = check_clock(which_clock); - if (error == 0) { - error = -EPERM; - } - return error; + return error ? : -EPERM; } - /* * Sample a per-thread clock for the given task. */ -- cgit v1.2.3 From bfcf3e92c6c07cd1084624bad5622f3dad96328c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:49 +0200 Subject: posix-cpu-timers: Use common permission check in posix_cpu_clock_get() Replace the next slightly different copy of permission checks. That also removes the necessarity to check the return value of the sample functions because the clock id is already validated. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.414813172@linutronix.de --- kernel/time/posix-cpu-timers.c | 57 +++++++++++------------------------------- 1 file changed, 14 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index b06ed8b14861..eb11117bf227 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -289,53 +289,24 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return 0; } -static int posix_cpu_clock_get_task(struct task_struct *tsk, - const clockid_t which_clock, - struct timespec64 *tp) +static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) { - int err = -EINVAL; - u64 rtn; - - if (CPUCLOCK_PERTHREAD(which_clock)) { - if (same_thread_group(tsk, current)) - err = cpu_clock_sample(which_clock, tsk, &rtn); - } else { - if (tsk == current || thread_group_leader(tsk)) - err = cpu_clock_sample_group(which_clock, tsk, &rtn); - } - - if (!err) - *tp = ns_to_timespec64(rtn); + const clockid_t clkid = CPUCLOCK_WHICH(clock); + struct task_struct *tsk; + u64 t; - return err; -} - - -static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) -{ - const pid_t pid = CPUCLOCK_PID(which_clock); - int err = -EINVAL; + tsk = get_task_for_clock(clock); + if (!tsk) + return -EINVAL; - if (pid == 0) { - /* - * Special case constant value for our own clocks. - * We don't have to do any lookup to find ourselves. - */ - err = posix_cpu_clock_get_task(current, which_clock, tp); - } else { - /* - * Find the given PID, and validate that the caller - * should be able to see it. 
- */ - struct task_struct *p; - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) - err = posix_cpu_clock_get_task(p, which_clock, tp); - rcu_read_unlock(); - } + if (CPUCLOCK_PERTHREAD(clock)) + cpu_clock_sample(clkid, tsk, &t); + else + cpu_clock_sample_group(clkid, tsk, &t); + put_task_struct(tsk); - return err; + *tp = ns_to_timespec64(t); + return 0; } /* -- cgit v1.2.3 From e5a8b65b4cb2fe024b83bdec0424269949cc0a27 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:50 +0200 Subject: posix-cpu-timers: Use common permission check in posix_cpu_timer_create() Yet another copy of the same thing gone... Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.505833418@linutronix.de --- kernel/time/posix-cpu-timers.c | 35 +++-------------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index eb11117bf227..4426a0f9c470 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -316,44 +316,15 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { - int ret = 0; - const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); - struct task_struct *p; + struct task_struct *p = get_task_for_clock(new_timer->it_clock); - if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) + if (!p) return -EINVAL; new_timer->kclock = &clock_posix_cpu; - INIT_LIST_HEAD(&new_timer->it.cpu.entry); - - rcu_read_lock(); - if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { - if (pid == 0) { - p = current; - } else { - p = find_task_by_vpid(pid); - if (p && !same_thread_group(p, current)) - p = NULL; - } - } else { - if (pid == 0) { - p = current->group_leader; - } else { - p = find_task_by_vpid(pid); - if (p && !has_group_leader_pid(p)) - p = NULL; - } - } new_timer->it.cpu.task = p; - if (p) { - get_task_struct(p); - } else { - ret = -EINVAL; - } - rcu_read_unlock(); - - return ret; + return 0; } /* -- cgit v1.2.3 From 19298fbf453c90a6cf72288155f80c6f55e9139d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:51 +0200 Subject: posix-cpu-timers: Provide quick sample function for itimer get_itimer() needs a sample of the current thread group cputime. It invokes thread_group_cputimer() - which is a misnomer. That function also starts eventually the group cputime accouting which is bogus because the accounting is already active when a timer is armed. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.599658199@linutronix.de --- kernel/time/posix-cpu-timers.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 4426a0f9c470..c22b6b604a95 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -232,6 +232,27 @@ static inline void sample_cputime_atomic(struct task_cputime *times, times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); } +/** + * thread_group_sample_cputime - Sample cputime for a given task + * @tsk: Task for which cputime needs to be started + * @iimes: Storage for time samples + * + * Called from sys_getitimer() to calculate the expiry time of an active + * timer. That means group cputime accounting is already active. Called + * with task sighand lock held. 
+ * + * Updates @times with an uptodate sample of the thread group cputimes. + */ +void thread_group_sample_cputime(struct task_struct *tsk, + struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + WARN_ON_ONCE(!cputimer->running); + + sample_cputime_atomic(times, &cputimer->cputime_atomic); +} + void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; -- cgit v1.2.3 From a34360d42434bbf28c0f375444c52c154ae3e6cf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:52 +0200 Subject: itimers: Use quick sample function get_itimer() locks sighand lock and checks whether the timer is already expired. If it is not expired then the thread group cputime accounting is already enabled. Use the sampling function not the one which is meant for starting a timer. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.689713638@linutronix.de --- kernel/time/itimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 9d26fd4ba4c0..ae04bc259240 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -58,7 +58,7 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, struct task_cputime cputime; u64 t; - thread_group_cputimer(tsk, &cputime); + thread_group_sample_cputime(tsk, &cputime); if (clock_id == CPUCLOCK_PROF) t = cputime.utime + cputime.stime; else -- cgit v1.2.3 From a324956fae05d863386c682830e917f6685f1d4f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:53 +0200 Subject: posix-cpu-timers: Sample directly in timer check The thread group accounting is active, otherwise the expiry function would not be running. Sample the thread group time directly. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.780348088@linutronix.de --- kernel/time/posix-cpu-timers.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index c22b6b604a95..cb736787145b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -914,16 +914,17 @@ static void check_process_timers(struct task_struct *tsk, if (!READ_ONCE(tsk->signal->cputimer.running)) return; - /* + /* * Signify that a thread is checking for process timers. * Write access to this field is protected by the sighand lock. */ sig->cputimer.checking_timer = true; /* - * Collect the current process totals. + * Collect the current process totals. Group accounting is active + * so the sample can be taken directly. */ - thread_group_cputimer(tsk, &cputime); + sample_cputime_atomic(&cputime, &sig->cputimer.cputime_atomic); utime = cputime.utime; ptime = utime + cputime.stime; sum_sched_runtime = cputime.sum_exec_runtime; -- cgit v1.2.3 From c506bef424ca282f2ad357e86fee940c69018974 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:54 +0200 Subject: posix-cpu-timers: Rename thread_group_cputimer() and make it static thread_group_cputimer() is a complete misnomer. The function does two things: - For arming process wide timers it makes sure that the atomic time storage is up to date. If no cpu timer is armed yet, then the atomic time storage is not updated by the scheduler for performance reasons. 
In that case a full summing up of all threads needs to be done and the update needs to be enabled. - Samples the current time into the caller supplied storage. Rename it to thread_group_start_cputime(), make it static and fixup the callsite. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.869350319@linutronix.de --- kernel/time/posix-cpu-timers.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cb736787145b..def225ae069a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -253,7 +253,20 @@ void thread_group_sample_cputime(struct task_struct *tsk, sample_cputime_atomic(times, &cputimer->cputime_atomic); } -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +/** + * thread_group_start_cputime - Start cputime and return a sample + * @tsk: Task for which cputime needs to be started + * @iimes: Storage for time samples + * + * The thread group cputime accouting is avoided when there are no posix + * CPU timers armed. Before starting a timer it's required to check whether + * the time accounting is active. If not, a full update of the atomic + * accounting store needs to be done and the accounting enabled. + * + * Updates @times with an uptodate sample of the thread group cputimes. + */ +static void +thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct task_cputime sum; @@ -536,7 +549,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, { struct task_cputime cputime; - thread_group_cputimer(p, &cputime); + thread_group_start_cputime(p, &cputime); switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; -- cgit v1.2.3 From 24ab7f5a7b2c917e89fc6a87252f18faff91d6ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:55 +0200 Subject: posix-cpu-timers: Consolidate thread group sample code cpu_clock_sample_group() and cpu_timer_sample_group() are almost the same. Before the rename one called thread_group_cputimer() and the other thread_group_cputime(). Really intuitive function names. Consolidate the functions and also avoid the thread traversal when the thread group's accounting is already active. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192919.960966884@linutronix.de --- kernel/time/posix-cpu-timers.c | 59 ++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index def225ae069a..a9003b2d614d 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -294,29 +294,37 @@ thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) } /* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with task sighand lock held for safe while_each_thread() - * traversal. + * Sample a process (thread group) clock for the given task clkid. If the + * group's cputime accounting is already enabled, read the atomic + * store. Otherwise a full update is required. Task's sighand lock must be + * held to protect the task traversal on a full update. 
*/ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - u64 *sample) + u64 *sample, bool start) { + struct thread_group_cputimer *cputimer = &p->signal->cputimer; struct task_cputime cputime; + if (!READ_ONCE(cputimer->running)) { + if (start) + thread_group_start_cputime(p, &cputime); + else + thread_group_cputime(p, &cputime); + } else { + sample_cputime_atomic(&cputime, &cputimer->cputime_atomic); + } + switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - thread_group_cputime(p, &cputime); *sample = cputime.utime + cputime.stime; break; case CPUCLOCK_VIRT: - thread_group_cputime(p, &cputime); *sample = cputime.utime; break; case CPUCLOCK_SCHED: - thread_group_cputime(p, &cputime); *sample = cputime.sum_exec_runtime; break; } @@ -336,7 +344,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) if (CPUCLOCK_PERTHREAD(clock)) cpu_clock_sample(clkid, tsk, &t); else - cpu_clock_sample_group(clkid, tsk, &t); + cpu_clock_sample_group(clkid, tsk, &t, false); put_task_struct(tsk); *tp = ns_to_timespec64(t); @@ -539,33 +547,6 @@ static void cpu_timer_fire(struct k_itimer *timer) } } -/* - * Sample a process (thread group) timer for the given group_leader task. - * Must be called with task sighand lock held for safe while_each_thread() - * traversal. - */ -static int cpu_timer_sample_group(const clockid_t which_clock, - struct task_struct *p, u64 *sample) -{ - struct task_cputime cputime; - - thread_group_start_cputime(p, &cputime); - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; - case CPUCLOCK_PROF: - *sample = cputime.utime + cputime.stime; - break; - case CPUCLOCK_VIRT: - *sample = cputime.utime; - break; - case CPUCLOCK_SCHED: - *sample = cputime.sum_exec_runtime; - break; - } - return 0; -} - /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -627,7 +608,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &val); } else { - cpu_timer_sample_group(timer->it_clock, p, &val); + cpu_clock_sample_group(timer->it_clock, p, &val, true); } if (old) { @@ -755,7 +736,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp timer->it.cpu.expires = 0; return; } else { - cpu_timer_sample_group(timer->it_clock, p, &now); + cpu_clock_sample_group(timer->it_clock, p, &now, false); unlock_task_sighand(p, &flags); } } @@ -1042,7 +1023,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* If the process is dying, no need to rearm */ goto unlock; } - cpu_timer_sample_group(timer->it_clock, p, &now); + cpu_clock_sample_group(timer->it_clock, p, &now, true); bump_cpu_timer(timer, now); /* Leave the sighand locked for the call below. */ } @@ -1211,7 +1192,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) return; - ret = cpu_timer_sample_group(clock_idx, tsk, &now); + ret = cpu_clock_sample_group(clock_idx, tsk, &now, true); if (oldval && ret != -EINVAL) { /* -- cgit v1.2.3 From c7a37c6f4c651a531101c5721814333bae2804ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:56 +0200 Subject: posix-cpu-timers: Use clock ID in posix_cpu_timer_set() Extract the clock ID (PROF/VIRT/SCHED) from the clock selector and use it as argument to the sample functions. 
That allows to simplify them once all callers are fixed. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.050770464@linutronix.de --- kernel/time/posix-cpu-timers.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a9003b2d614d..12561f8ef378 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -556,10 +556,11 @@ static void cpu_timer_fire(struct k_itimer *timer) static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, struct itimerspec64 *new, struct itimerspec64 *old) { - unsigned long flags; - struct sighand_struct *sighand; - struct task_struct *p = timer->it.cpu.task; + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); u64 old_expires, new_expires, old_incr, val; + struct task_struct *p = timer->it.cpu.task; + struct sighand_struct *sighand; + unsigned long flags; int ret; if (WARN_ON_ONCE(!p)) @@ -606,9 +607,9 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * check if it's already passed. In short, we need a sample. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &val); + cpu_clock_sample(clkid, p, &val); } else { - cpu_clock_sample_group(timer->it_clock, p, &val, true); + cpu_clock_sample_group(clkid, p, &val, true); } if (old) { -- cgit v1.2.3 From 99093c5b81f58815fbfc1fe301c87206c6f5f730 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:57 +0200 Subject: posix-cpu-timers: Use clock ID in posix_cpu_timer_get() Extract the clock ID (PROF/VIRT/SCHED) from the clock selector and use it as argument to the sample functions. That allows to simplify them once all callers are fixed. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.155487201@linutronix.de --- kernel/time/posix-cpu-timers.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 12561f8ef378..8e4ce8ade88c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -699,6 +699,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); struct task_struct *p = timer->it.cpu.task; u64 now; @@ -717,7 +718,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp * Sample the clock to take the difference with the expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); + cpu_clock_sample(clkid, p, &now); } else { struct sighand_struct *sighand; unsigned long flags; @@ -737,7 +738,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp timer->it.cpu.expires = 0; return; } else { - cpu_clock_sample_group(timer->it_clock, p, &now, false); + cpu_clock_sample_group(clkid, p, &now, false); unlock_task_sighand(p, &flags); } } -- cgit v1.2.3 From da020ce406b2a9b714b82de9123a4c5a4848647b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:58 +0200 Subject: posix-cpu-timers: Use clock ID in posix_cpu_timer_rearm() Extract the clock ID (PROF/VIRT/SCHED) from the clock selector and use it as argument to the sample functions. 
That allows to simplify them once all callers are fixed. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.245357769@linutronix.de --- kernel/time/posix-cpu-timers.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8e4ce8ade88c..aff7e3b64d84 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -987,6 +987,7 @@ static void check_process_timers(struct task_struct *tsk, */ static void posix_cpu_timer_rearm(struct k_itimer *timer) { + clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); struct task_struct *p = timer->it.cpu.task; struct sighand_struct *sighand; unsigned long flags; @@ -999,7 +1000,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) * Fetch the current sample and update the timer's expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(timer->it_clock, p, &now); + cpu_clock_sample(clkid, p, &now); bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) return; @@ -1025,7 +1026,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* If the process is dying, no need to rearm */ goto unlock; } - cpu_clock_sample_group(timer->it_clock, p, &now, true); + cpu_clock_sample_group(clkid, p, &now, true); bump_cpu_timer(timer, now); /* Leave the sighand locked for the call below. */ } -- cgit v1.2.3 From 5405d0051f7ca820d1781d87baf4d730ff58f208 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:08:59 +0200 Subject: posix-cpu-timers: Remove pointless return value check set_process_cpu_timer() checks already whether the clock id is valid. No point in checking the return value of the sample function. That allows to simplify the sample function later. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.339725769@linutronix.de --- kernel/time/posix-cpu-timers.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index aff7e3b64d84..5944b7494be7 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1190,14 +1190,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, u64 *newval, u64 *oldval) { u64 now; - int ret; if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) return; - ret = cpu_clock_sample_group(clock_idx, tsk, &now, true); + cpu_clock_sample_group(clock_idx, tsk, &now, true); - if (oldval && ret != -EINVAL) { + if (oldval) { /* * We are setting itimer. The *oldval is absolute and we update * it to be relative, *newval argument is relative and we update -- cgit v1.2.3 From 2092c1d4fed9f279d10600b4c1b5167dd8486a2a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:00 +0200 Subject: posix-cpu-timers: Simplify sample functions All callers hand in a valdiated clock id. Remove the return value which was unchecked in most places anyway. 
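For context on what these samples feed, the same clocks are visible from userspace as CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID. A small runnable sketch (not part of the series) showing the consumer side of posix_cpu_clock_get():

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec proc, thr;

	/* burn a little CPU so the samples are visibly non-zero */
	for (volatile unsigned long i = 0; i < 50000000UL; i++)
		;

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &proc);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &thr);

	printf("process cputime: %ld.%09ld s\n", (long)proc.tv_sec, proc.tv_nsec);
	printf("thread  cputime: %ld.%09ld s\n", (long)thr.tv_sec, thr.tv_nsec);
	return 0;
}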
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.430475832@linutronix.de --- kernel/time/posix-cpu-timers.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5944b7494be7..cc3d148344d3 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -180,14 +180,12 @@ posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) } /* - * Sample a per-thread clock for the given task. + * Sample a per-thread clock for the given task. clkid is validated. */ -static int cpu_clock_sample(const clockid_t which_clock, - struct task_struct *p, u64 *sample) +static void cpu_clock_sample(const clockid_t clkid, struct task_struct *p, + u64 *sample) { - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; + switch (clkid) { case CPUCLOCK_PROF: *sample = prof_ticks(p); break; @@ -197,8 +195,9 @@ static int cpu_clock_sample(const clockid_t which_clock, case CPUCLOCK_SCHED: *sample = task_sched_runtime(p); break; + default: + WARN_ON_ONCE(1); } - return 0; } /* @@ -297,11 +296,11 @@ thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) * Sample a process (thread group) clock for the given task clkid. If the * group's cputime accounting is already enabled, read the atomic * store. Otherwise a full update is required. Task's sighand lock must be - * held to protect the task traversal on a full update. + * held to protect the task traversal on a full update. clkid is already + * validated. */ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - u64 *sample, bool start) +static void cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, + u64 *sample, bool start) { struct thread_group_cputimer *cputimer = &p->signal->cputimer; struct task_cputime cputime; @@ -315,9 +314,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, sample_cputime_atomic(&cputime, &cputimer->cputime_atomic); } - switch (CPUCLOCK_WHICH(which_clock)) { - default: - return -EINVAL; + switch (clkid) { case CPUCLOCK_PROF: *sample = cputime.utime + cputime.stime; break; @@ -327,8 +324,9 @@ static int cpu_clock_sample_group(const clockid_t which_clock, case CPUCLOCK_SCHED: *sample = cputime.sum_exec_runtime; break; + default: + WARN_ON_ONCE(1); } - return 0; } static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) -- cgit v1.2.3 From 8c2d74f03705c2c993a57673bae8fd535eabede6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:01 +0200 Subject: posix-cpu-timers: Get rid of pointer indirection Now that the sample functions have no return value anymore, the result can simply be returned instead of using pointer indirection. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.535079278@linutronix.de --- kernel/time/posix-cpu-timers.c | 50 +++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cc3d148344d3..a2007ce9322a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -182,22 +182,19 @@ posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) /* * Sample a per-thread clock for the given task. clkid is validated. 
*/ -static void cpu_clock_sample(const clockid_t clkid, struct task_struct *p, - u64 *sample) +static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p) { switch (clkid) { case CPUCLOCK_PROF: - *sample = prof_ticks(p); - break; + return prof_ticks(p); case CPUCLOCK_VIRT: - *sample = virt_ticks(p); - break; + return virt_ticks(p); case CPUCLOCK_SCHED: - *sample = task_sched_runtime(p); - break; + return task_sched_runtime(p); default: WARN_ON_ONCE(1); } + return 0; } /* @@ -299,8 +296,8 @@ thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) * held to protect the task traversal on a full update. clkid is already * validated. */ -static void cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, - u64 *sample, bool start) +static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, + bool start) { struct thread_group_cputimer *cputimer = &p->signal->cputimer; struct task_cputime cputime; @@ -316,17 +313,15 @@ static void cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, switch (clkid) { case CPUCLOCK_PROF: - *sample = cputime.utime + cputime.stime; - break; + return cputime.utime + cputime.stime; case CPUCLOCK_VIRT: - *sample = cputime.utime; - break; + return cputime.utime; case CPUCLOCK_SCHED: - *sample = cputime.sum_exec_runtime; - break; + return cputime.sum_exec_runtime; default: WARN_ON_ONCE(1); } + return 0; } static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) @@ -340,9 +335,9 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) return -EINVAL; if (CPUCLOCK_PERTHREAD(clock)) - cpu_clock_sample(clkid, tsk, &t); + t = cpu_clock_sample(clkid, tsk); else - cpu_clock_sample_group(clkid, tsk, &t, false); + t = cpu_clock_sample_group(clkid, tsk, false); put_task_struct(tsk); *tp = ns_to_timespec64(t); @@ -604,11 +599,10 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * times (in arm_timer). With an absolute time, we must * check if it's already passed. In short, we need a sample. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(clkid, p, &val); - } else { - cpu_clock_sample_group(clkid, p, &val, true); - } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + val = cpu_clock_sample(clkid, p); + else + val = cpu_clock_sample_group(clkid, p, true); if (old) { if (old_expires == 0) { @@ -716,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp * Sample the clock to take the difference with the expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(clkid, p, &now); + now = cpu_clock_sample(clkid, p); } else { struct sighand_struct *sighand; unsigned long flags; @@ -736,7 +730,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp timer->it.cpu.expires = 0; return; } else { - cpu_clock_sample_group(clkid, p, &now, false); + now = cpu_clock_sample_group(clkid, p, false); unlock_task_sighand(p, &flags); } } @@ -998,7 +992,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) * Fetch the current sample and update the timer's expiry time. 
*/ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - cpu_clock_sample(clkid, p, &now); + now = cpu_clock_sample(clkid, p); bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) return; @@ -1024,7 +1018,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* If the process is dying, no need to rearm */ goto unlock; } - cpu_clock_sample_group(clkid, p, &now, true); + now = cpu_clock_sample_group(clkid, p, true); bump_cpu_timer(timer, now); /* Leave the sighand locked for the call below. */ } @@ -1192,7 +1186,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) return; - cpu_clock_sample_group(clock_idx, tsk, &now, true); + now = cpu_clock_sample_group(clock_idx, tsk, true); if (oldval) { /* -- cgit v1.2.3 From 0476ff2c151ee35bda2f938e8828b6f6113ebf55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:02 +0200 Subject: posix-cpu-timers: Sample task times once in expiry check Sampling the task times twice does not make sense. Do it once. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.639878168@linutronix.de --- kernel/time/posix-cpu-timers.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a2007ce9322a..98dab3e4be08 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -785,9 +785,9 @@ static inline void check_dl_overrun(struct task_struct *tsk) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - struct list_head *timers = tsk->cpu_timers; struct task_cputime *tsk_expires = &tsk->cputime_expires; - u64 expires; + struct list_head *timers = tsk->cpu_timers; + u64 expires, stime, utime; unsigned long soft; if (dl_task(tsk)) @@ -800,10 +800,12 @@ static void check_thread_timers(struct task_struct *tsk, if (task_cputime_zero(&tsk->cputime_expires)) return; - expires = check_timers_list(timers, firing, prof_ticks(tsk)); + task_cputime(tsk, &utime, &stime); + + expires = check_timers_list(timers, firing, utime + stime); tsk_expires->prof_exp = expires; - expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + expires = check_timers_list(++timers, firing, utime); tsk_expires->virt_exp = expires; tsk_expires->sched_exp = check_timers_list(++timers, firing, -- cgit v1.2.3 From ab693c5a5e3107f035d5162e6ada9aaf7dd76a1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:03 +0200 Subject: posix-cpu-timers: Move prof/virt_ticks into caller The functions have only one caller left. No point in having them. Move the almost duplicated code into the caller and simplify it. 
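To make the resulting shape concrete outside of the diff context, here is a minimal, compilable userspace model of the consolidated sampler. task_cputime() and task_sched_runtime() are stand-in stubs, and the clock ids are plain enum values rather than the kernel's encoded clockid_t; this is a sketch of the pattern, not the kernel code itself.

#include <stdint.h>
#include <stdio.h>

enum { CPUCLOCK_PROF, CPUCLOCK_VIRT, CPUCLOCK_SCHED };

struct task {
        uint64_t utime, stime, sum_exec_runtime;
};

/* Stand-ins for the kernel accessors the real function relies on. */
static void task_cputime(const struct task *p, uint64_t *utime, uint64_t *stime)
{
        *utime = p->utime;
        *stime = p->stime;
}

static uint64_t task_sched_runtime(const struct task *p)
{
        return p->sum_exec_runtime;
}

/* One reading of utime/stime now serves both the PROF and VIRT cases. */
static uint64_t cpu_clock_sample(int clkid, const struct task *p)
{
        uint64_t utime, stime;

        if (clkid == CPUCLOCK_SCHED)
                return task_sched_runtime(p);

        task_cputime(p, &utime, &stime);

        switch (clkid) {
        case CPUCLOCK_PROF:
                return utime + stime;
        case CPUCLOCK_VIRT:
                return utime;
        default:
                return 0;       /* the kernel WARNs here */
        }
}

int main(void)
{
        struct task t = { .utime = 30, .stime = 12, .sum_exec_runtime = 55 };

        printf("PROF=%llu VIRT=%llu SCHED=%llu\n",
               (unsigned long long)cpu_clock_sample(CPUCLOCK_PROF, &t),
               (unsigned long long)cpu_clock_sample(CPUCLOCK_VIRT, &t),
               (unsigned long long)cpu_clock_sample(CPUCLOCK_SCHED, &t));
        return 0;
}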
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.729298382@linutronix.de --- kernel/time/posix-cpu-timers.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 98dab3e4be08..b1c97664a730 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -130,23 +130,6 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) return 0; } -static inline u64 prof_ticks(struct task_struct *p) -{ - u64 utime, stime; - - task_cputime(p, &utime, &stime); - - return utime + stime; -} -static inline u64 virt_ticks(struct task_struct *p) -{ - u64 utime, stime; - - task_cputime(p, &utime, &stime); - - return utime; -} - static int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { @@ -184,13 +167,18 @@ posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp) */ static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p) { + u64 utime, stime; + + if (clkid == CPUCLOCK_SCHED) + return task_sched_runtime(p); + + task_cputime(p, &utime, &stime); + switch (clkid) { case CPUCLOCK_PROF: - return prof_ticks(p); + return utime + stime; case CPUCLOCK_VIRT: - return virt_ticks(p); - case CPUCLOCK_SCHED: - return task_sched_runtime(p); + return utime; default: WARN_ON_ONCE(1); } -- cgit v1.2.3 From 2b69942f9021bf75bd1b001f53bd2578361fadf3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:04 +0200 Subject: posix-cpu-timers: Create a container struct Per task/process data of posix CPU timers is all over the place which makes the code hard to follow and requires ifdeffery. Create a container to hold all this information in one place, so data is consolidated and the ifdeffery can be confined to the posix timer header file and removed from places like fork. As a first step, move the cpu_timers list head array into the new struct and clean up the initializers and simplify fork. The remaining #ifdef in fork will be removed later. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192920.819418976@linutronix.de --- kernel/fork.c | 11 ++++------- kernel/time/posix-cpu-timers.c | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d8ae0f1b4148..b6a135e4275b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1523,6 +1523,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) */ static void posix_cpu_timers_init_group(struct signal_struct *sig) { + struct posix_cputimers *pct = &sig->posix_cputimers; unsigned long cpu_limit; cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); @@ -1531,10 +1532,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) sig->cputimer.running = true; } - /* The timer lists. 
*/ - INIT_LIST_HEAD(&sig->cpu_timers[0]); - INIT_LIST_HEAD(&sig->cpu_timers[1]); - INIT_LIST_HEAD(&sig->cpu_timers[2]); + posix_cputimers_init(pct); } #else static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } @@ -1649,9 +1647,8 @@ static void posix_cpu_timers_init(struct task_struct *tsk) tsk->cputime_expires.prof_exp = 0; tsk->cputime_expires.virt_exp = 0; tsk->cputime_expires.sched_exp = 0; - INIT_LIST_HEAD(&tsk->cpu_timers[0]); - INIT_LIST_HEAD(&tsk->cpu_timers[1]); - INIT_LIST_HEAD(&tsk->cpu_timers[2]); + + posix_cputimers_init(&tsk->posix_cputimers); } #else static inline void posix_cpu_timers_init(struct task_struct *tsk) { } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index b1c97664a730..849e2045fb6e 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -407,11 +407,11 @@ static void cleanup_timers_list(struct list_head *head) * * This must be called with the siglock held. */ -static void cleanup_timers(struct list_head *head) +static void cleanup_timers(struct posix_cputimers *pct) { - cleanup_timers_list(head); - cleanup_timers_list(++head); - cleanup_timers_list(++head); + cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_PROF]); + cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_VIRT]); + cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_SCHED]); } /* @@ -421,11 +421,11 @@ static void cleanup_timers(struct list_head *head) */ void posix_cpu_timers_exit(struct task_struct *tsk) { - cleanup_timers(tsk->cpu_timers); + cleanup_timers(&tsk->posix_cputimers); } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - cleanup_timers(tsk->signal->cpu_timers); + cleanup_timers(&tsk->signal->posix_cputimers); } static inline int expires_gt(u64 expires, u64 new_exp) @@ -446,10 +446,10 @@ static void arm_timer(struct k_itimer *timer) struct cpu_timer_list *next; if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - head = p->cpu_timers; + head = p->posix_cputimers.cpu_timers; cputime_expires = &p->cputime_expires; } else { - head = p->signal->cpu_timers; + head = p->signal->posix_cputimers.cpu_timers; cputime_expires = &p->signal->cputime_expires; } head += CPUCLOCK_WHICH(timer->it_clock); @@ -773,8 +773,8 @@ static inline void check_dl_overrun(struct task_struct *tsk) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { + struct list_head *timers = tsk->posix_cputimers.cpu_timers; struct task_cputime *tsk_expires = &tsk->cputime_expires; - struct list_head *timers = tsk->cpu_timers; u64 expires, stime, utime; unsigned long soft; @@ -879,9 +879,9 @@ static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { struct signal_struct *const sig = tsk->signal; + struct list_head *timers = sig->posix_cputimers.cpu_timers; u64 utime, ptime, virt_expires, prof_expires; u64 sum_sched_runtime, sched_expires; - struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; unsigned long soft; -- cgit v1.2.3 From 3a245c0f110e2bfcf7f2cd2248a29005c78999e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:06 +0200 Subject: posix-cpu-timers: Move expiry cache into struct posix_cputimers The expiry cache belongs into the posix_cputimers container where the other cpu timers information is. 
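For orientation, a rough standalone sketch of the container this and the following patches converge on: one slot per clock holding the earliest expiry next to the list of armed timers. struct list_head is reduced to a stub, the field names follow the later "Restructure expiry array" naming (bases/nextevt), and posix_cputimers_init() is illustrative here since the real definition lives in the posix-timers header, which is outside this kernel/-only log.

#include <stdint.h>

#define NSEC_PER_SEC    1000000000ULL
#define RLIM_INFINITY   (~0UL)

enum { CPUCLOCK_PROF, CPUCLOCK_VIRT, CPUCLOCK_SCHED, CPUCLOCK_MAX };

/* Minimal stand-in for the kernel's struct list_head. */
struct list_head {
        struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *head)
{
        head->next = head->prev = head;
}

/* Per clock: the earliest expiry ("next event") and the armed timers. */
struct posix_cputimer_base {
        uint64_t         nextevt;
        struct list_head cpu_timers;
};

/* One container per task and per signal struct instead of scattered fields. */
struct posix_cputimers {
        struct posix_cputimer_base bases[CPUCLOCK_MAX];
};

static void posix_cputimers_init(struct posix_cputimers *pct)
{
        for (int i = 0; i < CPUCLOCK_MAX; i++) {
                pct->bases[i].nextevt = 0;
                INIT_LIST_HEAD(&pct->bases[i].cpu_timers);
        }
}

/* The process wide variant also seeds the PROF slot from RLIMIT_CPU. */
static void posix_cputimers_group_init(struct posix_cputimers *pct,
                                       unsigned long cpu_limit)
{
        posix_cputimers_init(pct);
        if (cpu_limit != RLIM_INFINITY)
                pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC;
}

int main(void)
{
        struct posix_cputimers pct;

        posix_cputimers_group_init(&pct, 2);    /* RLIMIT_CPU soft limit: 2s */
        return pct.bases[CPUCLOCK_PROF].nextevt == 2 * NSEC_PER_SEC ? 0 : 1;
}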
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.014444012@linutronix.de --- kernel/fork.c | 25 +++-------------------- kernel/sched/rt.c | 6 ++++-- kernel/time/posix-cpu-timers.c | 45 +++++++++++++++++++++++++----------------- 3 files changed, 34 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b6a135e4275b..52bfe7c20ff6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1527,12 +1527,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) unsigned long cpu_limit; cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); - if (cpu_limit != RLIM_INFINITY) { - sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; + posix_cputimers_group_init(pct, cpu_limit); + if (cpu_limit != RLIM_INFINITY) sig->cputimer.running = true; - } - - posix_cputimers_init(pct); } #else static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } @@ -1638,22 +1635,6 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_POSIX_TIMERS -/* - * Initialize POSIX timer handling for a single task. - */ -static void posix_cpu_timers_init(struct task_struct *tsk) -{ - tsk->cputime_expires.prof_exp = 0; - tsk->cputime_expires.virt_exp = 0; - tsk->cputime_expires.sched_exp = 0; - - posix_cputimers_init(&tsk->posix_cputimers); -} -#else -static inline void posix_cpu_timers_init(struct task_struct *tsk) { } -#endif - static inline void init_task_pid_links(struct task_struct *task) { enum pid_type type; @@ -1932,7 +1913,7 @@ static __latent_entropy struct task_struct *copy_process( task_io_accounting_init(&p->ioac); acct_clear_integrals(p); - posix_cpu_timers_init(p); + posix_cputimers_init(&p->posix_cputimers); p->io_context = NULL; audit_set_context(p, NULL); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index da3e85e61013..d6678f773c96 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2305,8 +2305,10 @@ static void watchdog(struct rq *rq, struct task_struct *p) } next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); - if (p->rt.timeout > next) - p->cputime_expires.sched_exp = p->se.sum_exec_runtime; + if (p->rt.timeout > next) { + posix_cputimers_rt_watchdog(&p->posix_cputimers, + p->se.sum_exec_runtime); + } } } #else diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 849e2045fb6e..3e29d1692437 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -20,11 +20,18 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer); +void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) +{ + posix_cputimers_init(pct); + if (cpu_limit != RLIM_INFINITY) + pct->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; +} + /* * Called after updating RLIMIT_CPU to run cpu timer and update - * tsk->signal->cputime_expires expiration cache if necessary. Needs - * siglock protection since other code may update expiration cache as - * well. + * tsk->signal->posix_cputimers.cputime_expires expiration cache if + * necessary. Needs siglock protection since other code may update + * expiration cache as well. 
*/ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { @@ -447,10 +454,10 @@ static void arm_timer(struct k_itimer *timer) if (CPUCLOCK_PERTHREAD(timer->it_clock)) { head = p->posix_cputimers.cpu_timers; - cputime_expires = &p->cputime_expires; + cputime_expires = &p->posix_cputimers.cputime_expires; } else { head = p->signal->posix_cputimers.cpu_timers; - cputime_expires = &p->signal->cputime_expires; + cputime_expires = &p->signal->posix_cputimers.cputime_expires; } head += CPUCLOCK_WHICH(timer->it_clock); @@ -774,7 +781,7 @@ static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { struct list_head *timers = tsk->posix_cputimers.cpu_timers; - struct task_cputime *tsk_expires = &tsk->cputime_expires; + struct task_cputime *tsk_expires = &tsk->posix_cputimers.cputime_expires; u64 expires, stime, utime; unsigned long soft; @@ -785,7 +792,7 @@ static void check_thread_timers(struct task_struct *tsk, * If cputime_expires is zero, then there are no active * per thread CPU timers. */ - if (task_cputime_zero(&tsk->cputime_expires)) + if (task_cputime_zero(tsk_expires)) return; task_cputime(tsk, &utime, &stime); @@ -954,10 +961,10 @@ static void check_process_timers(struct task_struct *tsk, prof_expires = x; } - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; - sig->cputime_expires.sched_exp = sched_expires; - if (task_cputime_zero(&sig->cputime_expires)) + sig->posix_cputimers.cputime_expires.prof_exp = prof_expires; + sig->posix_cputimers.cputime_expires.virt_exp = virt_expires; + sig->posix_cputimers.cputime_expires.sched_exp = sched_expires; + if (task_cputime_zero(&sig->posix_cputimers.cputime_expires)) stop_process_timers(sig); sig->cputimer.checking_timer = false; @@ -1058,12 +1065,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - if (!task_cputime_zero(&tsk->cputime_expires)) { + if (!task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) { struct task_cputime task_sample; task_cputime(tsk, &task_sample.utime, &task_sample.stime); task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; - if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + if (task_cputime_expired(&task_sample, + &tsk->posix_cputimers.cputime_expires)) return 1; } @@ -1088,7 +1096,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk) sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); - if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + if (task_cputime_expired(&group_sample, + &sig->posix_cputimers.cputime_expires)) return 1; } @@ -1204,12 +1213,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, */ switch (clock_idx) { case CPUCLOCK_PROF: - if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) - tsk->signal->cputime_expires.prof_exp = *newval; + if (expires_gt(tsk->signal->posix_cputimers.cputime_expires.prof_exp, *newval)) + tsk->signal->posix_cputimers.cputime_expires.prof_exp = *newval; break; case CPUCLOCK_VIRT: - if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) - tsk->signal->cputime_expires.virt_exp = *newval; + if (expires_gt(tsk->signal->posix_cputimers.cputime_expires.virt_exp, *newval)) + tsk->signal->posix_cputimers.cputime_expires.virt_exp = *newval; break; } -- cgit v1.2.3 From 11b8462f7e1d25f639c88949a2746a9c2667a766 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:07 +0200 Subject: posix-cpu-timers: Provide array 
based access to expiry cache Using struct task_cputime for the expiry cache is a pretty odd choice and comes with magic defines to rename the fields for usage in the expiry cache. struct task_cputime is basically a u64 array with 3 members, but it has distinct members. The expiry cache content is different than the content of task_cputime because expiry[PROF] = task_cputime.stime + task_cputime.utime expiry[VIRT] = task_cputime.utime expiry[SCHED] = task_cputime.sum_exec_runtime So there is no direct mapping between task_cputime and the expiry cache and the #define based remapping is just a horrible hack. Having the expiry cache array based allows further simplification of the expiry code. To avoid an all in one cleanup which is hard to review add a temporary anonymous union into struct task_cputime which allows array based access to it. That requires to reorder the members. Add a build time sanity check to validate that the members are at the same place. The union and the build time checks will be removed after conversion. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.105793824@linutronix.de --- kernel/time/posix-cpu-timers.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3e29d1692437..a38b6d04e8b5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -18,13 +18,23 @@ #include "posix-timers.h" +static inline void temporary_check(void) +{ + BUILD_BUG_ON(offsetof(struct task_cputime, stime) != + CPUCLOCK_PROF * sizeof(u64)); + BUILD_BUG_ON(offsetof(struct task_cputime, utime) != + CPUCLOCK_VIRT * sizeof(u64)); + BUILD_BUG_ON(offsetof(struct task_cputime, sum_exec_runtime) != + CPUCLOCK_SCHED * sizeof(u64)); +} + static void posix_cpu_timer_rearm(struct k_itimer *timer); void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { posix_cputimers_init(pct); if (cpu_limit != RLIM_INFINITY) - pct->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; + pct->expiries[CPUCLOCK_PROF] = cpu_limit * NSEC_PER_SEC; } /* -- cgit v1.2.3 From 3b495b22d04df3220ccae829bf7c5cadb3059ccf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:08 +0200 Subject: posix-cpu-timers: Simplify timer queueing Now that the expiry cache can be accessed as an array, the per clock checking can be reduced to just comparing the corresponding array elements. 
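A hedged, standalone model of the queueing logic described above: a plain singly linked list stands in for the kernel's list_head, the expiry cache is a bare array, and the zero-means-disabled convention still applies at this point in the series.

#include <stdint.h>
#include <stdio.h>

enum { CPUCLOCK_PROF, CPUCLOCK_VIRT, CPUCLOCK_SCHED, CPUCLOCK_MAX };

struct cpu_timer_list {
        uint64_t expires;
        struct cpu_timer_list *next;
};

/* 0 still means "slot disabled" until the later U64_MAX conversion. */
static int expires_gt(uint64_t expires, uint64_t new_exp)
{
        return expires == 0 || expires > new_exp;
}

/*
 * Queue @nt sorted by expiry.  Only when it becomes the new list head,
 * i.e. the earliest-expiring timer of that clock, does the expiry cache
 * slot for @clkid need a refresh - now a single indexed compare instead
 * of a per clock switch().
 */
static void arm_timer(struct cpu_timer_list **head, uint64_t *expiries,
                      int clkid, struct cpu_timer_list *nt)
{
        struct cpu_timer_list **pos = head;

        while (*pos && (*pos)->expires <= nt->expires)
                pos = &(*pos)->next;

        nt->next = *pos;
        *pos = nt;

        if (pos != head)
                return;

        if (expires_gt(expiries[clkid], nt->expires))
                expiries[clkid] = nt->expires;
}

int main(void)
{
        uint64_t expiries[CPUCLOCK_MAX] = { 0 };
        struct cpu_timer_list *head = NULL;
        struct cpu_timer_list a = { .expires = 500 }, b = { .expires = 200 };

        arm_timer(&head, expiries, CPUCLOCK_VIRT, &a);
        arm_timer(&head, expiries, CPUCLOCK_VIRT, &b);  /* new earliest */

        printf("VIRT nextevt = %llu\n",
               (unsigned long long)expiries[CPUCLOCK_VIRT]);
        return 0;
}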
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.212129449@linutronix.de --- kernel/time/posix-cpu-timers.c | 55 ++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a38b6d04e8b5..b13241756fdd 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -456,20 +456,20 @@ static inline int expires_gt(u64 expires, u64 new_exp) */ static void arm_timer(struct k_itimer *timer) { + struct cpu_timer_list *const nt = &timer->it.cpu; + int clkidx = CPUCLOCK_WHICH(timer->it_clock); + u64 *cpuexp, newexp = timer->it.cpu.expires; struct task_struct *p = timer->it.cpu.task; struct list_head *head, *listpos; - struct task_cputime *cputime_expires; - struct cpu_timer_list *const nt = &timer->it.cpu; struct cpu_timer_list *next; if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - head = p->posix_cputimers.cpu_timers; - cputime_expires = &p->posix_cputimers.cputime_expires; + head = p->posix_cputimers.cpu_timers + clkidx; + cpuexp = p->posix_cputimers.expiries + clkidx; } else { - head = p->signal->posix_cputimers.cpu_timers; - cputime_expires = &p->signal->posix_cputimers.cputime_expires; + head = p->signal->posix_cputimers.cpu_timers + clkidx; + cpuexp = p->signal->posix_cputimers.expiries + clkidx; } - head += CPUCLOCK_WHICH(timer->it_clock); listpos = head; list_for_each_entry(next, head, entry) { @@ -479,35 +479,22 @@ static void arm_timer(struct k_itimer *timer) } list_add(&nt->entry, listpos); - if (listpos == head) { - u64 exp = nt->expires; + if (listpos != head) + return; - /* - * We are the new earliest-expiring POSIX 1.b timer, hence - * need to update expiration cache. Take into account that - * for process timers we share expiration cache with itimers - * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. - */ + /* + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. + */ + if (expires_gt(*cpuexp, newexp)) + *cpuexp = newexp; - switch (CPUCLOCK_WHICH(timer->it_clock)) { - case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp)) - cputime_expires->prof_exp = exp; - break; - case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp)) - cputime_expires->virt_exp = exp; - break; - case CPUCLOCK_SCHED: - if (expires_gt(cputime_expires->sched_exp, exp)) - cputime_expires->sched_exp = exp; - break; - } - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); - else - tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); - } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); } /* -- cgit v1.2.3 From 1b0dd96d0f07c482bf1d04987cc1b35e376a7518 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:09 +0200 Subject: posix-cpu-timers: Simplify set_process_cpu_timer() The expiry cache can now be accessed as an array. Replace the per clock checks with a simple comparison of the clock indexed array member. 
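As the updated comment in the patch notes, the CPUCLOCK_PROF slot is shared between ITIMER_PROF and RLIMIT_CPU. A small sketch of that sharing, keeping only the "earliest event wins" rule with 0 still meaning "disabled" at this stage; names and values here are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

enum { CPUCLOCK_PROF, CPUCLOCK_VIRT, CPUCLOCK_SCHED, CPUCLOCK_MAX };

static uint64_t expiries[CPUCLOCK_MAX];         /* 0 == slot disabled */

/* Earliest event wins; a disabled slot takes any value. */
static void update_expiry(unsigned int clkid, uint64_t newval)
{
        if (expiries[clkid] == 0 || newval < expiries[clkid])
                expiries[clkid] = newval;
}

int main(void)
{
        /* An RLIMIT_CPU soft limit of 10s arms the PROF slot ... */
        update_expiry(CPUCLOCK_PROF, 10 * NSEC_PER_SEC);
        /* ... and a 2s ITIMER_PROF later pulls the same slot forward. */
        update_expiry(CPUCLOCK_PROF, 2 * NSEC_PER_SEC);

        printf("PROF nextevt = %llus\n",
               (unsigned long long)(expiries[CPUCLOCK_PROF] / NSEC_PER_SEC));
        return 0;
}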
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.303316423@linutronix.de --- kernel/time/posix-cpu-timers.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index b13241756fdd..2c47ce6cab8c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1174,15 +1174,15 @@ void run_posix_cpu_timers(void) * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. */ -void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, u64 *newval, u64 *oldval) { - u64 now; + u64 now, *expiry = tsk->signal->posix_cputimers.expiries + clkid; - if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED)) + if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED)) return; - now = cpu_clock_sample_group(clock_idx, tsk, true); + now = cpu_clock_sample_group(clkid, tsk, true); if (oldval) { /* @@ -1205,19 +1205,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } /* - * Update expiration cache if we are the earliest timer, or eventually - * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. + * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF + * expiry cache is also used by RLIMIT_CPU!. */ - switch (clock_idx) { - case CPUCLOCK_PROF: - if (expires_gt(tsk->signal->posix_cputimers.cputime_expires.prof_exp, *newval)) - tsk->signal->posix_cputimers.cputime_expires.prof_exp = *newval; - break; - case CPUCLOCK_VIRT: - if (expires_gt(tsk->signal->posix_cputimers.cputime_expires.virt_exp, *newval)) - tsk->signal->posix_cputimers.cputime_expires.virt_exp = *newval; - break; - } + if (expires_gt(*expiry, *newval)) + *expiry = *newval; tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } -- cgit v1.2.3 From c02b078e63a6f42029cb655d0aa3c991271637ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:10 +0200 Subject: posix-cpu-timers: Switch check_*_timers() to array cache Use the array based expiry cache in check_thread_timers() and convert the store in check_process_timers() for consistency. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.408222378@linutronix.de --- kernel/time/posix-cpu-timers.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2c47ce6cab8c..220e3c7ae849 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -778,8 +778,7 @@ static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { struct list_head *timers = tsk->posix_cputimers.cpu_timers; - struct task_cputime *tsk_expires = &tsk->posix_cputimers.cputime_expires; - u64 expires, stime, utime; + u64 stime, utime, *expires = tsk->posix_cputimers.expiries; unsigned long soft; if (dl_task(tsk)) @@ -789,19 +788,14 @@ static void check_thread_timers(struct task_struct *tsk, * If cputime_expires is zero, then there are no active * per thread CPU timers. 
*/ - if (task_cputime_zero(tsk_expires)) + if (task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) return; task_cputime(tsk, &utime, &stime); - expires = check_timers_list(timers, firing, utime + stime); - tsk_expires->prof_exp = expires; - - expires = check_timers_list(++timers, firing, utime); - tsk_expires->virt_exp = expires; - - tsk_expires->sched_exp = check_timers_list(++timers, firing, - tsk->se.sum_exec_runtime); + *expires++ = check_timers_list(timers, firing, utime + stime); + *expires++ = check_timers_list(++timers, firing, utime); + *expires = check_timers_list(++timers, firing, tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. @@ -839,7 +833,8 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } - if (task_cputime_zero(tsk_expires)) + + if (task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } @@ -958,9 +953,10 @@ static void check_process_timers(struct task_struct *tsk, prof_expires = x; } - sig->posix_cputimers.cputime_expires.prof_exp = prof_expires; - sig->posix_cputimers.cputime_expires.virt_exp = virt_expires; - sig->posix_cputimers.cputime_expires.sched_exp = sched_expires; + sig->posix_cputimers.expiries[CPUCLOCK_PROF] = prof_expires; + sig->posix_cputimers.expiries[CPUCLOCK_VIRT] = virt_expires; + sig->posix_cputimers.expiries[CPUCLOCK_SCHED] = sched_expires; + if (task_cputime_zero(&sig->posix_cputimers.cputime_expires)) stop_process_timers(sig); -- cgit v1.2.3 From b0d524f77956eec887b30732af1f5f98cbf62b9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:12 +0200 Subject: posix-cpu-timers: Provide array based sample functions Instead of using task_cputime and doing the addition of utime and stime at all call sites, it's way simpler to have a sample array which allows indexed based checks against the expiry cache array. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.590362974@linutronix.de --- kernel/time/posix-cpu-timers.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 220e3c7ae849..11c841c6a45d 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -202,6 +202,32 @@ static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p) return 0; } +static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime) +{ + samples[CPUCLOCK_PROF] = stime + utime; + samples[CPUCLOCK_VIRT] = utime; + samples[CPUCLOCK_SCHED] = rtime; +} + +static void task_sample_cputime(struct task_struct *p, u64 *samples) +{ + u64 stime, utime; + + task_cputime(p, &utime, &stime); + store_samples(samples, stime, utime, p->se.sum_exec_runtime); +} + +static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, + u64 *samples) +{ + u64 stime, utime, rtime; + + utime = atomic64_read(&at->utime); + stime = atomic64_read(&at->stime); + rtime = atomic64_read(&at->sum_exec_runtime); + store_samples(samples, stime, utime, rtime); +} + /* * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg * to avoid race conditions with concurrent updates to cputime. 
-- cgit v1.2.3 From 001f7971433a53bb76443cd8f5827ca27b0bc8ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:13 +0200 Subject: posix-cpu-timers: Make expiry checks array based The expiry cache is an array indexed by clock ids. The new sample functions allow to retrieve a corresponding array of samples. Convert the fastpath expiry checks to make use of the new sample functions and do the comparisons on the sample and the expiry array. Make the check for the expiry array being zero array based as well. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.695481430@linutronix.de --- kernel/time/posix-cpu-timers.c | 85 ++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 11c841c6a45d..e159b039e44a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -39,7 +39,7 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) /* * Called after updating RLIMIT_CPU to run cpu timer and update - * tsk->signal->posix_cputimers.cputime_expires expiration cache if + * tsk->signal->posix_cputimers.expiries expiration cache if * necessary. Needs siglock protection since other code may update * expiration cache as well. */ @@ -132,19 +132,9 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) } } -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) +static inline bool expiry_cache_is_zero(const u64 *ec) { - if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) - return 1; - return 0; + return !(ec[CPUCLOCK_PROF] | ec[CPUCLOCK_VIRT] | ec[CPUCLOCK_SCHED]); } static int @@ -811,10 +801,10 @@ static void check_thread_timers(struct task_struct *tsk, check_dl_overrun(tsk); /* - * If cputime_expires is zero, then there are no active - * per thread CPU timers. + * If the expiry cache is zero, then there are no active per thread + * CPU timers. */ - if (task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) + if (expiry_cache_is_zero(tsk->posix_cputimers.expiries)) return; task_cputime(tsk, &utime, &stime); @@ -860,7 +850,7 @@ static void check_thread_timers(struct task_struct *tsk, } } - if (task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) + if (expiry_cache_is_zero(tsk->posix_cputimers.expiries)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } @@ -983,7 +973,7 @@ static void check_process_timers(struct task_struct *tsk, sig->posix_cputimers.expiries[CPUCLOCK_VIRT] = virt_expires; sig->posix_cputimers.expiries[CPUCLOCK_SCHED] = sched_expires; - if (task_cputime_zero(&sig->posix_cputimers.cputime_expires)) + if (expiry_cache_is_zero(sig->posix_cputimers.expiries)) stop_process_timers(sig); sig->cputimer.checking_timer = false; @@ -1048,26 +1038,23 @@ unlock: } /** - * task_cputime_expired - Compare two task_cputime entities. + * task_cputimers_expired - Compare two task_cputime entities. * - * @sample: The task_cputime structure to be checked for expiration. - * @expires: Expiration times, against which @sample will be checked. 
+ * @samples: Array of current samples for the CPUCLOCK clocks + * @expiries: Array of expiry values for the CPUCLOCK clocks * - * Checks @sample against @expires to see if any field of @sample has expired. - * Returns true if any field of the former is greater than the corresponding - * field of the latter if the latter field is set. Otherwise returns false. + * Returns true if any mmember of @samples is greater than the corresponding + * member of @expiries if that member is non zero. False otherwise */ -static inline int task_cputime_expired(const struct task_cputime *sample, - const struct task_cputime *expires) +static inline bool task_cputimers_expired(const u64 *sample, const u64 *expiries) { - if (expires->utime && sample->utime >= expires->utime) - return 1; - if (expires->stime && sample->utime + sample->stime >= expires->stime) - return 1; - if (expires->sum_exec_runtime != 0 && - sample->sum_exec_runtime >= expires->sum_exec_runtime) - return 1; - return 0; + int i; + + for (i = 0; i < CPUCLOCK_MAX; i++) { + if (expiries[i] && sample[i] >= expiries[i]) + return true; + } + return false; } /** @@ -1080,18 +1067,17 @@ static inline int task_cputime_expired(const struct task_cputime *sample, * timers and compare them with the corresponding expiration times. Return * true if a timer has expired, else return false. */ -static inline int fastpath_timer_check(struct task_struct *tsk) +static inline bool fastpath_timer_check(struct task_struct *tsk) { + u64 *expiries = tsk->posix_cputimers.expiries; struct signal_struct *sig; - if (!task_cputime_zero(&tsk->posix_cputimers.cputime_expires)) { - struct task_cputime task_sample; + if (!expiry_cache_is_zero(expiries)) { + u64 samples[CPUCLOCK_MAX]; - task_cputime(tsk, &task_sample.utime, &task_sample.stime); - task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; - if (task_cputime_expired(&task_sample, - &tsk->posix_cputimers.cputime_expires)) - return 1; + task_sample_cputime(tsk, samples); + if (task_cputimers_expired(samples, expiries)) + return true; } sig = tsk->signal; @@ -1111,19 +1097,20 @@ static inline int fastpath_timer_check(struct task_struct *tsk) */ if (READ_ONCE(sig->cputimer.running) && !READ_ONCE(sig->cputimer.checking_timer)) { - struct task_cputime group_sample; + u64 samples[CPUCLOCK_MAX]; - sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); + proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, + samples); - if (task_cputime_expired(&group_sample, - &sig->posix_cputimers.cputime_expires)) - return 1; + if (task_cputimers_expired(samples, + sig->posix_cputimers.expiries)) + return true; } if (dl_task(tsk) && tsk->dl.dl_overrun) - return 1; + return true; - return 0; + return false; } /* -- cgit v1.2.3 From 46b883995c88520f2c4de6a64cccc04c69d59f83 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:14 +0200 Subject: posix-cpu-timers: Remove cputime_expires The last users of the magic struct cputime based expiry cache are gone. Remove the leftovers. 
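The transitional check removed here verified that the reordered struct task_cputime members line up with the clock indices, so the struct could temporarily be aliased as a u64 array. As a sketch, the same compile-time guarantee could be written in plain C11 with _Static_assert; the kernel version used BUILD_BUG_ON inside a helper function instead. This compiles to nothing and only fails the build if the layout drifts.

#include <stddef.h>
#include <stdint.h>

enum { CPUCLOCK_PROF, CPUCLOCK_VIRT, CPUCLOCK_SCHED };

/* Members ordered so index N of a u64 view matches clock id N. */
struct task_cputime {
        uint64_t stime;                 /* CPUCLOCK_PROF  */
        uint64_t utime;                 /* CPUCLOCK_VIRT  */
        uint64_t sum_exec_runtime;      /* CPUCLOCK_SCHED */
};

_Static_assert(offsetof(struct task_cputime, stime) ==
               CPUCLOCK_PROF * sizeof(uint64_t), "PROF index mismatch");
_Static_assert(offsetof(struct task_cputime, utime) ==
               CPUCLOCK_VIRT * sizeof(uint64_t), "VIRT index mismatch");
_Static_assert(offsetof(struct task_cputime, sum_exec_runtime) ==
               CPUCLOCK_SCHED * sizeof(uint64_t), "SCHED index mismatch");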
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.790209622@linutronix.de --- kernel/time/posix-cpu-timers.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e159b039e44a..ffd49181e23d 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -18,16 +18,6 @@ #include "posix-timers.h" -static inline void temporary_check(void) -{ - BUILD_BUG_ON(offsetof(struct task_cputime, stime) != - CPUCLOCK_PROF * sizeof(u64)); - BUILD_BUG_ON(offsetof(struct task_cputime, utime) != - CPUCLOCK_VIRT * sizeof(u64)); - BUILD_BUG_ON(offsetof(struct task_cputime, sum_exec_runtime) != - CPUCLOCK_SCHED * sizeof(u64)); -} - static void posix_cpu_timer_rearm(struct k_itimer *timer); void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) -- cgit v1.2.3 From 87dc64480fb19a6a0fedbdff1e2557be50673287 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 26 Aug 2019 20:22:24 +0200 Subject: posix-cpu-timers: Restructure expiry array Now that the abused struct task_cputime is gone, it's more natural to bundle the expiry cache and the list head of each clock into a struct and have an array of those structs. Follow the hrtimer naming convention of 'bases' and rename the expiry cache to 'nextevt' and adapt all usage sites. Generates also better code .text size shrinks by 80 bytes. Suggested-by: Ingo Molnar Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1908262021140.1939@nanos.tec.linutronix.de --- kernel/time/posix-cpu-timers.c | 105 ++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ffd49181e23d..9ac601abc4c4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -24,13 +24,13 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { posix_cputimers_init(pct); if (cpu_limit != RLIM_INFINITY) - pct->expiries[CPUCLOCK_PROF] = cpu_limit * NSEC_PER_SEC; + pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC; } /* * Called after updating RLIMIT_CPU to run cpu timer and update - * tsk->signal->posix_cputimers.expiries expiration cache if - * necessary. Needs siglock protection since other code may update + * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if + * necessary. Needs siglock protection since other code may update the * expiration cache as well. 
*/ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) @@ -122,9 +122,11 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) } } -static inline bool expiry_cache_is_zero(const u64 *ec) +static inline bool expiry_cache_is_zero(const struct posix_cputimers *pct) { - return !(ec[CPUCLOCK_PROF] | ec[CPUCLOCK_VIRT] | ec[CPUCLOCK_SCHED]); + return !(pct->bases[CPUCLOCK_PROF].nextevt | + pct->bases[CPUCLOCK_VIRT].nextevt | + pct->bases[CPUCLOCK_SCHED].nextevt); } static int @@ -432,9 +434,9 @@ static void cleanup_timers_list(struct list_head *head) */ static void cleanup_timers(struct posix_cputimers *pct) { - cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_PROF]); - cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_VIRT]); - cleanup_timers_list(&pct->cpu_timers[CPUCLOCK_SCHED]); + cleanup_timers_list(&pct->bases[CPUCLOCK_PROF].cpu_timers); + cleanup_timers_list(&pct->bases[CPUCLOCK_VIRT].cpu_timers); + cleanup_timers_list(&pct->bases[CPUCLOCK_SCHED].cpu_timers); } /* @@ -464,21 +466,19 @@ static void arm_timer(struct k_itimer *timer) { struct cpu_timer_list *const nt = &timer->it.cpu; int clkidx = CPUCLOCK_WHICH(timer->it_clock); - u64 *cpuexp, newexp = timer->it.cpu.expires; struct task_struct *p = timer->it.cpu.task; + u64 newexp = timer->it.cpu.expires; + struct posix_cputimer_base *base; struct list_head *head, *listpos; struct cpu_timer_list *next; - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - head = p->posix_cputimers.cpu_timers + clkidx; - cpuexp = p->posix_cputimers.expiries + clkidx; - } else { - head = p->signal->posix_cputimers.cpu_timers + clkidx; - cpuexp = p->signal->posix_cputimers.expiries + clkidx; - } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + base = p->posix_cputimers.bases + clkidx; + else + base = p->signal->posix_cputimers.bases + clkidx; - listpos = head; - list_for_each_entry(next, head, entry) { + listpos = head = &base->cpu_timers; + list_for_each_entry(next,head, entry) { if (nt->expires < next->expires) break; listpos = &next->entry; @@ -494,8 +494,8 @@ static void arm_timer(struct k_itimer *timer) * for process timers we share expiration cache with itimers * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. */ - if (expires_gt(*cpuexp, newexp)) - *cpuexp = newexp; + if (expires_gt(base->nextevt, newexp)) + base->nextevt = newexp; if (CPUCLOCK_PERTHREAD(timer->it_clock)) tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); @@ -783,9 +783,9 @@ static inline void check_dl_overrun(struct task_struct *tsk) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - struct list_head *timers = tsk->posix_cputimers.cpu_timers; - u64 stime, utime, *expires = tsk->posix_cputimers.expiries; + struct posix_cputimer_base *base = tsk->posix_cputimers.bases; unsigned long soft; + u64 stime, utime; if (dl_task(tsk)) check_dl_overrun(tsk); @@ -794,14 +794,18 @@ static void check_thread_timers(struct task_struct *tsk, * If the expiry cache is zero, then there are no active per thread * CPU timers. 
*/ - if (expiry_cache_is_zero(tsk->posix_cputimers.expiries)) + if (expiry_cache_is_zero(&tsk->posix_cputimers)) return; task_cputime(tsk, &utime, &stime); - *expires++ = check_timers_list(timers, firing, utime + stime); - *expires++ = check_timers_list(++timers, firing, utime); - *expires = check_timers_list(++timers, firing, tsk->se.sum_exec_runtime); + base->nextevt = check_timers_list(&base->cpu_timers, firing, + utime + stime); + base++; + base->nextevt = check_timers_list(&base->cpu_timers, firing, utime); + base++; + base->nextevt = check_timers_list(&base->cpu_timers, firing, + tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. @@ -840,7 +844,7 @@ static void check_thread_timers(struct task_struct *tsk, } } - if (expiry_cache_is_zero(tsk->posix_cputimers.expiries)) + if (expiry_cache_is_zero(&tsk->posix_cputimers)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } @@ -884,7 +888,7 @@ static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { struct signal_struct *const sig = tsk->signal; - struct list_head *timers = sig->posix_cputimers.cpu_timers; + struct posix_cputimer_base *base = sig->posix_cputimers.bases; u64 utime, ptime, virt_expires, prof_expires; u64 sum_sched_runtime, sched_expires; struct task_cputime cputime; @@ -912,9 +916,12 @@ static void check_process_timers(struct task_struct *tsk, ptime = utime + cputime.stime; sum_sched_runtime = cputime.sum_exec_runtime; - prof_expires = check_timers_list(timers, firing, ptime); - virt_expires = check_timers_list(++timers, firing, utime); - sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); + prof_expires = check_timers_list(&base[CPUCLOCK_PROF].cpu_timers, + firing, ptime); + virt_expires = check_timers_list(&base[CPUCLOCK_VIRT].cpu_timers, + firing, utime); + sched_expires = check_timers_list(&base[CPUCLOCK_SCHED].cpu_timers, + firing, sum_sched_runtime); /* * Check for the special case process timers. @@ -959,11 +966,11 @@ static void check_process_timers(struct task_struct *tsk, prof_expires = x; } - sig->posix_cputimers.expiries[CPUCLOCK_PROF] = prof_expires; - sig->posix_cputimers.expiries[CPUCLOCK_VIRT] = virt_expires; - sig->posix_cputimers.expiries[CPUCLOCK_SCHED] = sched_expires; + base[CPUCLOCK_PROF].nextevt = prof_expires; + base[CPUCLOCK_VIRT].nextevt = virt_expires; + base[CPUCLOCK_SCHED].nextevt = sched_expires; - if (expiry_cache_is_zero(sig->posix_cputimers.expiries)) + if (expiry_cache_is_zero(&sig->posix_cputimers)) stop_process_timers(sig); sig->cputimer.checking_timer = false; @@ -1028,20 +1035,21 @@ unlock: } /** - * task_cputimers_expired - Compare two task_cputime entities. + * task_cputimers_expired - Check whether posix CPU timers are expired * * @samples: Array of current samples for the CPUCLOCK clocks - * @expiries: Array of expiry values for the CPUCLOCK clocks + * @pct: Pointer to a posix_cputimers container * - * Returns true if any mmember of @samples is greater than the corresponding - * member of @expiries if that member is non zero. False otherwise + * Returns true if any member of @samples is greater than the corresponding + * member of @pct->bases[CLK].nextevt. 
False otherwise */ -static inline bool task_cputimers_expired(const u64 *sample, const u64 *expiries) +static inline bool +task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) { int i; for (i = 0; i < CPUCLOCK_MAX; i++) { - if (expiries[i] && sample[i] >= expiries[i]) + if (pct->bases[i].nextevt && sample[i] >= pct->bases[i].nextevt) return true; } return false; @@ -1059,14 +1067,13 @@ static inline bool task_cputimers_expired(const u64 *sample, const u64 *expiries */ static inline bool fastpath_timer_check(struct task_struct *tsk) { - u64 *expiries = tsk->posix_cputimers.expiries; struct signal_struct *sig; - if (!expiry_cache_is_zero(expiries)) { + if (!expiry_cache_is_zero(&tsk->posix_cputimers)) { u64 samples[CPUCLOCK_MAX]; task_sample_cputime(tsk, samples); - if (task_cputimers_expired(samples, expiries)) + if (task_cputimers_expired(samples, &tsk->posix_cputimers)) return true; } @@ -1092,8 +1099,7 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); - if (task_cputimers_expired(samples, - sig->posix_cputimers.expiries)) + if (task_cputimers_expired(samples, &sig->posix_cputimers)) return true; } @@ -1176,11 +1182,12 @@ void run_posix_cpu_timers(void) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, u64 *newval, u64 *oldval) { - u64 now, *expiry = tsk->signal->posix_cputimers.expiries + clkid; + u64 now, *nextevt; if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED)) return; + nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt; now = cpu_clock_sample_group(clkid, tsk, true); if (oldval) { @@ -1207,8 +1214,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF * expiry cache is also used by RLIMIT_CPU!. */ - if (expires_gt(*expiry, *newval)) - *expiry = *newval; + if (expires_gt(*nextevt, *newval)) + *nextevt = *newval; tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } -- cgit v1.2.3 From b7be4ef1365dcb56fdffc6689e41058b23f5996d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:16 +0200 Subject: posix-cpu-timers: Switch thread group sampling to array That allows more simplifications in various places. 
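A compilable userspace illustration of the array based sampling that callers like get_cpu_itimer() rely on after this change: store_samples() mirrors the helper added earlier in this series, the surrounding scaffolding is a simplified stand-in for the group accounting, and the values are made up.

#include <stdint.h>
#include <stdio.h>

enum { CPUCLOCK_PROF, CPUCLOCK_VIRT, CPUCLOCK_SCHED, CPUCLOCK_MAX };

/* Fill a clock-indexed sample array from raw cputime values. */
static void store_samples(uint64_t *samples, uint64_t stime, uint64_t utime,
                          uint64_t rtime)
{
        samples[CPUCLOCK_PROF]  = stime + utime;
        samples[CPUCLOCK_VIRT]  = utime;
        samples[CPUCLOCK_SCHED] = rtime;
}

int main(void)
{
        uint64_t samples[CPUCLOCK_MAX];

        /* Pretend these came from the thread group accounting. */
        store_samples(samples, 40, 10, 70);

        /* A caller like get_cpu_itimer() now just indexes by clock id. */
        for (int clkid = CPUCLOCK_PROF; clkid <= CPUCLOCK_VIRT; clkid++)
                printf("clock %d sample = %llu\n", clkid,
                       (unsigned long long)samples[clkid]);
        return 0;
}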
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192921.988426956@linutronix.de --- kernel/time/itimer.c | 11 ++--- kernel/time/posix-cpu-timers.c | 104 ++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index ae04bc259240..77f1e5635cc1 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -55,15 +55,10 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, val = it->expires; interval = it->incr; if (val) { - struct task_cputime cputime; - u64 t; + u64 t, samples[CPUCLOCK_MAX]; - thread_group_sample_cputime(tsk, &cputime); - if (clock_id == CPUCLOCK_PROF) - t = cputime.utime + cputime.stime; - else - /* CPUCLOCK_VIRT */ - t = cputime.utime; + thread_group_sample_cputime(tsk, samples); + t = samples[clock_id]; if (val < t) /* about to fire */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 9ac601abc4c4..e62139a89375 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -225,22 +225,14 @@ retry: } } -static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum) +static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, + struct task_cputime *sum) { __update_gt_cputime(&cputime_atomic->utime, sum->utime); __update_gt_cputime(&cputime_atomic->stime, sum->stime); __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime); } -/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */ -static inline void sample_cputime_atomic(struct task_cputime *times, - struct task_cputime_atomic *atomic_times) -{ - times->utime = atomic64_read(&atomic_times->utime); - times->stime = atomic64_read(&atomic_times->stime); - times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime); -} - /** * thread_group_sample_cputime - Sample cputime for a given task * @tsk: Task for which cputime needs to be started @@ -252,20 +244,19 @@ static inline void sample_cputime_atomic(struct task_cputime *times, * * Updates @times with an uptodate sample of the thread group cputimes. */ -void thread_group_sample_cputime(struct task_struct *tsk, - struct task_cputime *times) +void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; WARN_ON_ONCE(!cputimer->running); - sample_cputime_atomic(times, &cputimer->cputime_atomic); + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } /** * thread_group_start_cputime - Start cputime and return a sample * @tsk: Task for which cputime needs to be started - * @iimes: Storage for time samples + * @samples: Storage for time samples * * The thread group cputime accouting is avoided when there are no posix * CPU timers armed. Before starting a timer it's required to check whether @@ -274,14 +265,14 @@ void thread_group_sample_cputime(struct task_struct *tsk, * * Updates @times with an uptodate sample of the thread group cputimes. */ -static void -thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) +static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - struct task_cputime sum; /* Check if cputimer isn't running. This is accessed without locking. 
*/ if (!READ_ONCE(cputimer->running)) { + struct task_cputime sum; + /* * The POSIX timer interface allows for absolute time expiry * values through the TIMER_ABSTIME flag, therefore we have @@ -299,7 +290,15 @@ thread_group_start_cputime(struct task_struct *tsk, struct task_cputime *times) */ WRITE_ONCE(cputimer->running, true); } - sample_cputime_atomic(times, &cputimer->cputime_atomic); + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); +} + +static void __thread_group_cputime(struct task_struct *tsk, u64 *samples) +{ + struct task_cputime ct; + + thread_group_cputime(tsk, &ct); + store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime); } /* @@ -313,28 +312,18 @@ static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, bool start) { struct thread_group_cputimer *cputimer = &p->signal->cputimer; - struct task_cputime cputime; + u64 samples[CPUCLOCK_MAX]; if (!READ_ONCE(cputimer->running)) { if (start) - thread_group_start_cputime(p, &cputime); + thread_group_start_cputime(p, samples); else - thread_group_cputime(p, &cputime); + __thread_group_cputime(p, samples); } else { - sample_cputime_atomic(&cputime, &cputimer->cputime_atomic); + proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } - switch (clkid) { - case CPUCLOCK_PROF: - return cputime.utime + cputime.stime; - case CPUCLOCK_VIRT: - return cputime.utime; - case CPUCLOCK_SCHED: - return cputime.sum_exec_runtime; - default: - WARN_ON_ONCE(1); - } - return 0; + return samples[clkid]; } static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) @@ -889,9 +878,7 @@ static void check_process_timers(struct task_struct *tsk, { struct signal_struct *const sig = tsk->signal; struct posix_cputimer_base *base = sig->posix_cputimers.bases; - u64 utime, ptime, virt_expires, prof_expires; - u64 sum_sched_runtime, sched_expires; - struct task_cputime cputime; + u64 virt_exp, prof_exp, sched_exp, samples[CPUCLOCK_MAX]; unsigned long soft; /* @@ -911,30 +898,29 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. Group accounting is active * so the sample can be taken directly. */ - sample_cputime_atomic(&cputime, &sig->cputimer.cputime_atomic); - utime = cputime.utime; - ptime = utime + cputime.stime; - sum_sched_runtime = cputime.sum_exec_runtime; - - prof_expires = check_timers_list(&base[CPUCLOCK_PROF].cpu_timers, - firing, ptime); - virt_expires = check_timers_list(&base[CPUCLOCK_VIRT].cpu_timers, - firing, utime); - sched_expires = check_timers_list(&base[CPUCLOCK_SCHED].cpu_timers, - firing, sum_sched_runtime); + proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); + + prof_exp = check_timers_list(&base[CPUCLOCK_PROF].cpu_timers, + firing, samples[CPUCLOCK_PROF]); + virt_exp = check_timers_list(&base[CPUCLOCK_VIRT].cpu_timers, + firing, samples[CPUCLOCK_VIRT]); + sched_exp = check_timers_list(&base[CPUCLOCK_SCHED].cpu_timers, + firing, samples[CPUCLOCK_SCHED]); /* * Check for the special case process timers. 
*/ - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, - SIGPROF); - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, - SIGVTALRM); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_exp, + samples[CPUCLOCK_PROF], SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_exp, + samples[CPUCLOCK_PROF], SIGVTALRM); + soft = task_rlimit(tsk, RLIMIT_CPU); if (soft != RLIM_INFINITY) { - unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); + u64 softns, ptime = samples[CPUCLOCK_PROF]; unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); - u64 x; + unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); + if (psecs >= hard) { /* * At the hard limit, we just die. @@ -961,14 +947,14 @@ static void check_process_timers(struct task_struct *tsk, sig->rlim[RLIMIT_CPU].rlim_cur = soft; } } - x = soft * NSEC_PER_SEC; - if (!prof_expires || x < prof_expires) - prof_expires = x; + softns = soft * NSEC_PER_SEC; + if (!prof_exp || softns < prof_exp) + prof_exp = softns; } - base[CPUCLOCK_PROF].nextevt = prof_expires; - base[CPUCLOCK_VIRT].nextevt = virt_expires; - base[CPUCLOCK_SCHED].nextevt = sched_expires; + base[CPUCLOCK_PROF].nextevt = prof_exp; + base[CPUCLOCK_VIRT].nextevt = virt_exp; + base[CPUCLOCK_SCHED].nextevt = sched_exp; if (expiry_cache_is_zero(&sig->posix_cputimers)) stop_process_timers(sig); -- cgit v1.2.3 From fe0517f893d36636de20d0a809fc0c788ca0cade Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:17 +0200 Subject: posix-cpu-timers: Respect INFINITY for hard RTTIME limit The RTIME limit expiry code does not check the hard RTTIME limit for INFINITY, i.e. being disabled. Add it. While this could be considered an ABI breakage if something would depend on this behaviour. Though it's highly unlikely to have an effect because RLIM_INFINITY is at minimum INT_MAX and the RTTIME limit is in seconds, so the timer would fire after ~68 years. Adding this obvious correct limit check also allows further consolidation of that code and is a prerequisite for cleaning up the 0 based checks and the rlimit setter code. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.078293002@linutronix.de --- kernel/time/posix-cpu-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e62139a89375..a738d7659915 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -921,7 +921,7 @@ static void check_process_timers(struct task_struct *tsk, unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); - if (psecs >= hard) { + if (hard != RLIM_INFINITY && psecs >= hard) { /* * At the hard limit, we just die. * No need to calculate anything else now. -- cgit v1.2.3 From 24db4dd90dd53ad6e3331b6f01cb985e466cface Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:18 +0200 Subject: rlimit: Rewrite non-sensical RLIMIT_CPU comment The comment above the function which arms RLIMIT_CPU in the posix CPU timer code makes no sense at all. It claims that the kernel does not return an error code when it rejected the attempt to set RLIMIT_CPU. That's clearly bogus as the code does an error check and the rlimit is only set and activated when the permission checks are ok. In case of a rejection an appropriate error code is returned. 
This is a historical and outdated comment which got dragged along even when the rlimit handling code was rewritten. Replace it with an explanation why the setup function is not called when the rlimit value is RLIM_INFINITY and how the 'disarming' is handled. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.185511287@linutronix.de --- kernel/sys.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 2969304c29fe..c578b75d7923 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1576,10 +1576,9 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, task_unlock(tsk->group_leader); /* - * RLIMIT_CPU handling. Note that the kernel fails to return an error - * code if it rejected the user's attempt to set RLIMIT_CPU. This is a - * very long-standing error, and fixing it now risks breakage of - * applications, so we live with it + * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not + * infite. In case of RLIM_INFINITY the posix CPU timer code + * ignores the rlimit. */ if (!retval && new_rlim && resource == RLIMIT_CPU && new_rlim->rlim_cur != RLIM_INFINITY && -- cgit v1.2.3 From 2bbdbdae05167c688b6d3499a7dab74208b80a22 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:19 +0200 Subject: posix-cpu-timers: Get rid of zero checks Deactivation of the expiry cache is done by setting all clock caches to 0. That requires to have a check for zero in all places which update the expiry cache: if (cache == 0 || new < cache) cache = new; Use U64_MAX as the deactivated value, which allows to remove the zero checks when updating the cache and reduces it to the obvious check: if (new < cache) cache = new; This also removes the weird workaround in do_prlimit() which was required to convert a RLIMIT_CPU value of 0 (immediate expiry) to 1 because handing in 0 to the posix CPU timer code would have effectively disarmed it. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.275086128@linutronix.de --- kernel/sys.c | 9 --------- kernel/time/posix-cpu-timers.c | 38 +++++++++++++++----------------------- 2 files changed, 15 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index c578b75d7923..2462aa84247f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1557,15 +1557,6 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, retval = -EPERM; if (!retval) retval = security_task_setrlimit(tsk, resource, new_rlim); - if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - new_rlim->rlim_cur = 1; - } } if (!retval) { if (old_rlim) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a738d7659915..cf85292575c5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -122,11 +122,12 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) } } -static inline bool expiry_cache_is_zero(const struct posix_cputimers *pct) +/* Check whether all cache entries contain U64_MAX, i.e. 
eternal expiry time */ +static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct) { - return !(pct->bases[CPUCLOCK_PROF].nextevt | - pct->bases[CPUCLOCK_VIRT].nextevt | - pct->bases[CPUCLOCK_SCHED].nextevt); + return !(~pct->bases[CPUCLOCK_PROF].nextevt | + ~pct->bases[CPUCLOCK_VIRT].nextevt | + ~pct->bases[CPUCLOCK_SCHED].nextevt); } static int @@ -442,11 +443,6 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) cleanup_timers(&tsk->signal->posix_cputimers); } -static inline int expires_gt(u64 expires, u64 new_exp) -{ - return expires == 0 || expires > new_exp; -} - /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the sighand lock held. @@ -483,7 +479,7 @@ static void arm_timer(struct k_itimer *timer) * for process timers we share expiration cache with itimers * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. */ - if (expires_gt(base->nextevt, newexp)) + if (newexp < base->nextevt) base->nextevt = newexp; if (CPUCLOCK_PERTHREAD(timer->it_clock)) @@ -753,7 +749,7 @@ check_timers_list(struct list_head *timers, list_move_tail(&t->entry, firing); } - return 0; + return U64_MAX; } static inline void check_dl_overrun(struct task_struct *tsk) @@ -779,11 +775,7 @@ static void check_thread_timers(struct task_struct *tsk, if (dl_task(tsk)) check_dl_overrun(tsk); - /* - * If the expiry cache is zero, then there are no active per thread - * CPU timers. - */ - if (expiry_cache_is_zero(&tsk->posix_cputimers)) + if (expiry_cache_is_inactive(&tsk->posix_cputimers)) return; task_cputime(tsk, &utime, &stime); @@ -833,7 +825,7 @@ static void check_thread_timers(struct task_struct *tsk, } } - if (expiry_cache_is_zero(&tsk->posix_cputimers)) + if (expiry_cache_is_inactive(&tsk->posix_cputimers)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } @@ -864,7 +856,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } - if (it->expires && (!*expires || it->expires < *expires)) + if (it->expires && it->expires < *expires) *expires = it->expires; } @@ -948,7 +940,7 @@ static void check_process_timers(struct task_struct *tsk, } } softns = soft * NSEC_PER_SEC; - if (!prof_exp || softns < prof_exp) + if (softns < prof_exp) prof_exp = softns; } @@ -956,7 +948,7 @@ static void check_process_timers(struct task_struct *tsk, base[CPUCLOCK_VIRT].nextevt = virt_exp; base[CPUCLOCK_SCHED].nextevt = sched_exp; - if (expiry_cache_is_zero(&sig->posix_cputimers)) + if (expiry_cache_is_inactive(&sig->posix_cputimers)) stop_process_timers(sig); sig->cputimer.checking_timer = false; @@ -1035,7 +1027,7 @@ task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) int i; for (i = 0; i < CPUCLOCK_MAX; i++) { - if (pct->bases[i].nextevt && sample[i] >= pct->bases[i].nextevt) + if (sample[i] >= pct->bases[i].nextevt) return true; } return false; @@ -1055,7 +1047,7 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - if (!expiry_cache_is_zero(&tsk->posix_cputimers)) { + if (!expiry_cache_is_inactive(&tsk->posix_cputimers)) { u64 samples[CPUCLOCK_MAX]; task_sample_cputime(tsk, samples); @@ -1200,7 +1192,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF * expiry cache is also used by RLIMIT_CPU!. 
*/ - if (expires_gt(*nextevt, *newval)) + if (*newval < *nextevt) *nextevt = *newval; tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); -- cgit v1.2.3 From 1cd07c0b94f2c320270d76edb7dd49bceb09c1df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:20 +0200 Subject: posix-cpu-timers: Consolidate timer expiry further With the array based samples and expiry cache, the expiry function can use a loop to collect timers from the clock specific lists. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.365469982@linutronix.de --- kernel/time/posix-cpu-timers.c | 63 ++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cf85292575c5..caafdfdd6f0f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -752,6 +752,18 @@ check_timers_list(struct list_head *timers, return U64_MAX; } +static void collect_posix_cputimers(struct posix_cputimers *pct, + u64 *samples, struct list_head *firing) +{ + struct posix_cputimer_base *base = pct->bases; + int i; + + for (i = 0; i < CPUCLOCK_MAX; i++, base++) { + base->nextevt = check_timers_list(&base->cpu_timers, firing, + samples[i]); + } +} + static inline void check_dl_overrun(struct task_struct *tsk) { if (tsk->dl.dl_overrun) { @@ -768,25 +780,18 @@ static inline void check_dl_overrun(struct task_struct *tsk) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - struct posix_cputimer_base *base = tsk->posix_cputimers.bases; + struct posix_cputimers *pct = &tsk->posix_cputimers; + u64 samples[CPUCLOCK_MAX]; unsigned long soft; - u64 stime, utime; if (dl_task(tsk)) check_dl_overrun(tsk); - if (expiry_cache_is_inactive(&tsk->posix_cputimers)) + if (expiry_cache_is_inactive(pct)) return; - task_cputime(tsk, &utime, &stime); - - base->nextevt = check_timers_list(&base->cpu_timers, firing, - utime + stime); - base++; - base->nextevt = check_timers_list(&base->cpu_timers, firing, utime); - base++; - base->nextevt = check_timers_list(&base->cpu_timers, firing, - tsk->se.sum_exec_runtime); + task_sample_cputime(tsk, samples); + collect_posix_cputimers(pct, samples, firing); /* * Check for the special case thread timers. @@ -825,7 +830,7 @@ static void check_thread_timers(struct task_struct *tsk, } } - if (expiry_cache_is_inactive(&tsk->posix_cputimers)) + if (expiry_cache_is_inactive(pct)) tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } @@ -869,15 +874,15 @@ static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { struct signal_struct *const sig = tsk->signal; - struct posix_cputimer_base *base = sig->posix_cputimers.bases; - u64 virt_exp, prof_exp, sched_exp, samples[CPUCLOCK_MAX]; + struct posix_cputimers *pct = &sig->posix_cputimers; + u64 samples[CPUCLOCK_MAX]; unsigned long soft; /* * If cputimer is not running, then there are no active * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). */ - if (!READ_ONCE(tsk->signal->cputimer.running)) + if (!READ_ONCE(sig->cputimer.running)) return; /* @@ -891,21 +896,17 @@ static void check_process_timers(struct task_struct *tsk, * so the sample can be taken directly. 
*/ proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); - - prof_exp = check_timers_list(&base[CPUCLOCK_PROF].cpu_timers, - firing, samples[CPUCLOCK_PROF]); - virt_exp = check_timers_list(&base[CPUCLOCK_VIRT].cpu_timers, - firing, samples[CPUCLOCK_VIRT]); - sched_exp = check_timers_list(&base[CPUCLOCK_SCHED].cpu_timers, - firing, samples[CPUCLOCK_SCHED]); + collect_posix_cputimers(pct, samples, firing); /* * Check for the special case process timers. */ - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_exp, + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], + &pct->bases[CPUCLOCK_PROF].nextevt, samples[CPUCLOCK_PROF], SIGPROF); - check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_exp, - samples[CPUCLOCK_PROF], SIGVTALRM); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], + &pct->bases[CPUCLOCK_VIRT].nextevt, + samples[CPUCLOCK_VIRT], SIGVTALRM); soft = task_rlimit(tsk, RLIMIT_CPU); if (soft != RLIM_INFINITY) { @@ -940,15 +941,11 @@ static void check_process_timers(struct task_struct *tsk, } } softns = soft * NSEC_PER_SEC; - if (softns < prof_exp) - prof_exp = softns; + if (softns < pct->bases[CPUCLOCK_PROF].nextevt) + pct->bases[CPUCLOCK_PROF].nextevt = softns; } - base[CPUCLOCK_PROF].nextevt = prof_exp; - base[CPUCLOCK_VIRT].nextevt = virt_exp; - base[CPUCLOCK_SCHED].nextevt = sched_exp; - - if (expiry_cache_is_inactive(&sig->posix_cputimers)) + if (expiry_cache_is_inactive(pct)) stop_process_timers(sig); sig->cputimer.checking_timer = false; -- cgit v1.2.3 From 8ea1de90a5eccdc18c8f05f8596bae8660a3ff9a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:21 +0200 Subject: posix-cpu-timers: Get rid of 64bit divisions Instead of dividing A to match the units of B it's more efficient to multiply B to match the units of A. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.458286860@linutronix.de --- kernel/time/posix-cpu-timers.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index caafdfdd6f0f..dcdf9c8241b1 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -798,10 +798,11 @@ static void check_thread_timers(struct task_struct *tsk, */ soft = task_rlimit(tsk, RLIMIT_RTTIME); if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ + unsigned long rtim = tsk->rt.timeout * (USEC_PER_SEC / HZ); unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - if (hard != RLIM_INFINITY && - tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + if (hard != RLIM_INFINITY && rtim >= hard) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -813,7 +814,7 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + if (rtim >= soft) { /* * At the soft limit, send a SIGXCPU every second. */ @@ -910,11 +911,13 @@ static void check_process_timers(struct task_struct *tsk, soft = task_rlimit(tsk, RLIMIT_CPU); if (soft != RLIM_INFINITY) { - u64 softns, ptime = samples[CPUCLOCK_PROF]; + /* RLIMIT_CPU is in seconds. 
Samples are nanoseconds */ unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU); - unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); + u64 ptime = samples[CPUCLOCK_PROF]; + u64 softns = (u64)soft * NSEC_PER_SEC; + u64 hardns = (u64)hard * NSEC_PER_SEC; - if (hard != RLIM_INFINITY && psecs >= hard) { + if (hard != RLIM_INFINITY && ptime >= hardns) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -926,7 +929,7 @@ static void check_process_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (psecs >= soft) { + if (ptime >= softns) { /* * At the soft limit, send a SIGXCPU every second. */ @@ -936,11 +939,12 @@ static void check_process_timers(struct task_struct *tsk, } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); if (soft < hard) { - soft++; - sig->rlim[RLIMIT_CPU].rlim_cur = soft; + sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; + softns += NSEC_PER_SEC; } } - softns = soft * NSEC_PER_SEC; + + /* Update the expiry cache */ if (softns < pct->bases[CPUCLOCK_PROF].nextevt) pct->bases[CPUCLOCK_PROF].nextevt = softns; } -- cgit v1.2.3 From dd6702241337bcd0bae01d2644b7bae1a496d937 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:22 +0200 Subject: posix-cpu-timers: Remove pointless comparisons The soft RLIMIT expiry code checks whether the soft limit is greater than the hard limit. That's pointless because if the soft RLIMIT is greater than the hard RLIMIT then that code cannot be reached as the hard RLIMIT check is before that and already killed the process. Remove it. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.548747613@linutronix.de --- kernel/time/posix-cpu-timers.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index dcdf9c8241b1..115c8dfa4d46 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -814,15 +814,14 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } + if (rtim >= soft) { /* * At the soft limit, send a SIGXCPU every second. */ - if (soft < hard) { - soft += USEC_PER_SEC; - tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = - soft; - } + soft += USEC_PER_SEC; + tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft; + if (print_fatal_signals) { pr_info("RT Watchdog Timeout (soft): %s[%d]\n", tsk->comm, task_pid_nr(tsk)); @@ -938,10 +937,9 @@ static void check_process_timers(struct task_struct *tsk, tsk->comm, task_pid_nr(tsk)); } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - if (soft < hard) { - sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; - softns += NSEC_PER_SEC; - } + + sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; + softns += NSEC_PER_SEC; } /* Update the expiry cache */ -- cgit v1.2.3 From 8991afe2640d05a805eba01277856e8549cdc838 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:23 +0200 Subject: posix-cpu-timers: Deduplicate rlimit handling Both thread and process expiry functions have the same functionality for sending signals for soft and hard RLIMITs duplicated in 4 different ways. Split it out into a common function and cleanup the callsites. 
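After the split, all four duplicated blocks reduce to the same calling pattern; a condensed usage sketch taken from the hunks below (shown for the RLIMIT_RTTIME case in check_thread_timers()):

        /* At the hard limit check_rlimit() has already sent SIGKILL. */
        if (hard != RLIM_INFINITY &&
            check_rlimit(rttime, hard, SIGKILL, true, true))
                return;

        /* At the soft limit a SIGXCPU was sent; push the limit out by one second. */
        if (check_rlimit(rttime, soft, SIGXCPU, true, false)) {
                soft += USEC_PER_SEC;
                tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft;
        }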
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20190821192922.653276779@linutronix.de --- kernel/time/posix-cpu-timers.c | 67 ++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 115c8dfa4d46..ef39a7a4a95c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -772,6 +772,20 @@ static inline void check_dl_overrun(struct task_struct *tsk) } } +static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) +{ + if (time < limit) + return false; + + if (print_fatal_signals) { + pr_info("%s Watchdog Timeout (%s): %s[%d]\n", + rt ? "RT" : "CPU", hard ? "hard" : "soft", + current->comm, task_pid_nr(current)); + } + __group_send_sig_info(signo, SEND_SIG_PRIV, current); + return true; +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -799,34 +813,18 @@ static void check_thread_timers(struct task_struct *tsk, soft = task_rlimit(tsk, RLIMIT_RTTIME); if (soft != RLIM_INFINITY) { /* Task RT timeout is accounted in jiffies. RTTIME is usec */ - unsigned long rtim = tsk->rt.timeout * (USEC_PER_SEC / HZ); + unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - if (hard != RLIM_INFINITY && rtim >= hard) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - if (print_fatal_signals) { - pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + /* At the hard limit, send SIGKILL. No further action. */ + if (hard != RLIM_INFINITY && + check_rlimit(rttime, hard, SIGKILL, true, true)) return; - } - if (rtim >= soft) { - /* - * At the soft limit, send a SIGXCPU every second. - */ + /* At the soft limit, send a SIGXCPU every second */ + if (check_rlimit(rttime, soft, SIGXCPU, true, false)) { soft += USEC_PER_SEC; tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft; - - if (print_fatal_signals) { - pr_info("RT Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } @@ -916,28 +914,13 @@ static void check_process_timers(struct task_struct *tsk, u64 softns = (u64)soft * NSEC_PER_SEC; u64 hardns = (u64)hard * NSEC_PER_SEC; - if (hard != RLIM_INFINITY && ptime >= hardns) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. - */ - if (print_fatal_signals) { - pr_info("RT Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + /* At the hard limit, send SIGKILL. No further action. */ + if (hard != RLIM_INFINITY && + check_rlimit(ptime, hardns, SIGKILL, false, true)) return; - } - if (ptime >= softns) { - /* - * At the soft limit, send a SIGXCPU every second. 
- */ - if (print_fatal_signals) { - pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); - } - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + /* At the soft limit, send a SIGXCPU every second */ + if (check_rlimit(ptime, softns, SIGXCPU, false, false)) { sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1; softns += NSEC_PER_SEC; } -- cgit v1.2.3 From 244d49e30653658d4e7e9b2b8427777cbbc5affe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 21 Aug 2019 21:09:24 +0200 Subject: posix-cpu-timers: Move state tracking to struct posix_cputimers Put it where it belongs and clean up the ifdeffery in fork completely. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190821192922.743229404@linutronix.de --- kernel/fork.c | 6 ---- kernel/time/posix-cpu-timers.c | 73 +++++++++++++++++++++++------------------- 2 files changed, 40 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 52bfe7c20ff6..f1228d9f0b11 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1517,7 +1517,6 @@ void __cleanup_sighand(struct sighand_struct *sighand) } } -#ifdef CONFIG_POSIX_TIMERS /* * Initialize POSIX timer handling for a thread group. */ @@ -1528,12 +1527,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); posix_cputimers_group_init(pct, cpu_limit); - if (cpu_limit != RLIM_INFINITY) - sig->cputimer.running = true; } -#else -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } -#endif static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ef39a7a4a95c..52f4c99c1d60 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -23,8 +23,10 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer); void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { posix_cputimers_init(pct); - if (cpu_limit != RLIM_INFINITY) + if (cpu_limit != RLIM_INFINITY) { pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC; + pct->timers_active = true; + } } /* @@ -248,8 +250,9 @@ static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct posix_cputimers *pct = &tsk->signal->posix_cputimers; - WARN_ON_ONCE(!cputimer->running); + WARN_ON_ONCE(!pct->timers_active); proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } @@ -269,9 +272,10 @@ void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct posix_cputimers *pct = &tsk->signal->posix_cputimers; /* Check if cputimer isn't running. This is accessed without locking. */ - if (!READ_ONCE(cputimer->running)) { + if (!READ_ONCE(pct->timers_active)) { struct task_cputime sum; /* @@ -283,13 +287,13 @@ static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) update_gt_cputime(&cputimer->cputime_atomic, &sum); /* - * We're setting cputimer->running without a lock. Ensure - * this only gets written to in one operation. We set - * running after update_gt_cputime() as a small optimization, - * but barriers are not required because update_gt_cputime() + * We're setting timers_active without a lock. 
Ensure this + * only gets written to in one operation. We set it after + * update_gt_cputime() as a small optimization, but + * barriers are not required because update_gt_cputime() * can handle concurrent updates. */ - WRITE_ONCE(cputimer->running, true); + WRITE_ONCE(pct->timers_active, true); } proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples); } @@ -313,9 +317,10 @@ static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, bool start) { struct thread_group_cputimer *cputimer = &p->signal->cputimer; + struct posix_cputimers *pct = &p->signal->posix_cputimers; u64 samples[CPUCLOCK_MAX]; - if (!READ_ONCE(cputimer->running)) { + if (!READ_ONCE(pct->timers_active)) { if (start) thread_group_start_cputime(p, samples); else @@ -834,10 +839,10 @@ static void check_thread_timers(struct task_struct *tsk, static inline void stop_process_timers(struct signal_struct *sig) { - struct thread_group_cputimer *cputimer = &sig->cputimer; + struct posix_cputimers *pct = &sig->posix_cputimers; - /* Turn off cputimer->running. This is done without locking. */ - WRITE_ONCE(cputimer->running, false); + /* Turn off the active flag. This is done without locking. */ + WRITE_ONCE(pct->timers_active, false); tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } @@ -877,17 +882,17 @@ static void check_process_timers(struct task_struct *tsk, unsigned long soft; /* - * If cputimer is not running, then there are no active - * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). + * If there are no active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) nothing to check. */ - if (!READ_ONCE(sig->cputimer.running)) + if (!READ_ONCE(pct->timers_active)) return; /* * Signify that a thread is checking for process timers. * Write access to this field is protected by the sighand lock. */ - sig->cputimer.checking_timer = true; + pct->timers_active = true; /* * Collect the current process totals. Group accounting is active @@ -933,7 +938,7 @@ static void check_process_timers(struct task_struct *tsk, if (expiry_cache_is_inactive(pct)) stop_process_timers(sig); - sig->cputimer.checking_timer = false; + pct->expiry_active = false; } /* @@ -1027,39 +1032,41 @@ task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) */ static inline bool fastpath_timer_check(struct task_struct *tsk) { + struct posix_cputimers *pct = &tsk->posix_cputimers; struct signal_struct *sig; - if (!expiry_cache_is_inactive(&tsk->posix_cputimers)) { + if (!expiry_cache_is_inactive(pct)) { u64 samples[CPUCLOCK_MAX]; task_sample_cputime(tsk, samples); - if (task_cputimers_expired(samples, &tsk->posix_cputimers)) + if (task_cputimers_expired(samples, pct)) return true; } sig = tsk->signal; + pct = &sig->posix_cputimers; /* - * Check if thread group timers expired when the cputimer is - * running and no other thread in the group is already checking - * for thread group cputimers. These fields are read without the - * sighand lock. However, this is fine because this is meant to - * be a fastpath heuristic to determine whether we should try to - * acquire the sighand lock to check/handle timers. + * Check if thread group timers expired when timers are active and + * no other thread in the group is already handling expiry for + * thread group cputimers. These fields are read without the + * sighand lock. However, this is fine because this is meant to be + * a fastpath heuristic to determine whether we should try to + * acquire the sighand lock to handle timer expiry. 
* - * In the worst case scenario, if 'running' or 'checking_timer' gets - * set but the current thread doesn't see the change yet, we'll wait - * until the next thread in the group gets a scheduler interrupt to - * handle the timer. This isn't an issue in practice because these - * types of delays with signals actually getting sent are expected. + * In the worst case scenario, if concurrently timers_active is set + * or expiry_active is cleared, but the current thread doesn't see + * the change yet, the timer checks are delayed until the next + * thread in the group gets a scheduler interrupt to handle the + * timer. This isn't an issue in practice because these types of + * delays with signals actually getting sent are expected. */ - if (READ_ONCE(sig->cputimer.running) && - !READ_ONCE(sig->cputimer.checking_timer)) { + if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) { u64 samples[CPUCLOCK_MAX]; proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples); - if (task_cputimers_expired(samples, &sig->posix_cputimers)) + if (task_cputimers_expired(samples, pct)) return true; } -- cgit v1.2.3 From 60bda037f1dd8151e0c9ee5b09f0c091a0f643cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 27 Aug 2019 21:31:02 +0200 Subject: posix-cpu-timers: Utilize timerqueue for storage Using a linear O(N) search for timer insertion affects execution time and D-cache footprint badly with a larger number of timers. Switch the storage to a timerqueue which is already used for hrtimers and alarmtimers. It does not affect the size of struct k_itimer as it.alarm is still larger. The extra list head for the expiry list will go away later once the expiry is moved into task work context. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1908272129220.1939@nanos.tec.linutronix.de --- kernel/time/posix-cpu-timers.c | 189 +++++++++++++++++++++-------------------- 1 file changed, 96 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 52f4c99c1d60..73c492ce404b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -96,19 +96,19 @@ static inline int validate_clock_permissions(const clockid_t clock) * Update expiry time from increment, and increase overrun count, * given the current clock sample. */ -static void bump_cpu_timer(struct k_itimer *timer, u64 now) +static u64 bump_cpu_timer(struct k_itimer *timer, u64 now) { + u64 delta, incr, expires = timer->it.cpu.node.expires; int i; - u64 delta, incr; if (!timer->it_interval) - return; + return expires; - if (now < timer->it.cpu.expires) - return; + if (now < expires) + return expires; incr = timer->it_interval; - delta = now + incr - timer->it.cpu.expires; + delta = now + incr - expires; /* Don't use (incr*2 < delta), incr*2 might overflow. */ for (i = 0; incr < delta - incr; i++) @@ -118,10 +118,11 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) if (delta < incr) continue; - timer->it.cpu.expires += incr; + timer->it.cpu.node.expires += incr; timer->it_overrun += 1LL << i; delta -= incr; } + return timer->it.cpu.node.expires; } /* Check whether all cache entries contain U64_MAX, i.e. 
eternal expiry time */ @@ -365,7 +366,7 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; new_timer->kclock = &clock_posix_cpu; - INIT_LIST_HEAD(&new_timer->it.cpu.entry); + timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.task = p; return 0; } @@ -378,10 +379,11 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) */ static int posix_cpu_timer_del(struct k_itimer *timer) { - int ret = 0; - unsigned long flags; + struct cpu_timer *ctmr = &timer->it.cpu; + struct task_struct *p = ctmr->task; struct sighand_struct *sighand; - struct task_struct *p = timer->it.cpu.task; + unsigned long flags; + int ret = 0; if (WARN_ON_ONCE(!p)) return -EINVAL; @@ -393,15 +395,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer) sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) { /* - * We raced with the reaping of the task. - * The deletion should have cleared us off the list. + * This raced with the reaping of the task. The exit cleanup + * should have removed this timer from the timer queue. */ - WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); + WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node)); } else { if (timer->it.cpu.firing) ret = TIMER_RETRY; else - list_del(&timer->it.cpu.entry); + cpu_timer_dequeue(ctmr); unlock_task_sighand(p, &flags); } @@ -412,12 +414,16 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } -static void cleanup_timers_list(struct list_head *head) +static void cleanup_timerqueue(struct timerqueue_head *head) { - struct cpu_timer_list *timer, *next; + struct timerqueue_node *node; + struct cpu_timer *ctmr; - list_for_each_entry_safe(timer, next, head, entry) - list_del_init(&timer->entry); + while ((node = timerqueue_getnext(head))) { + timerqueue_del(head, node); + ctmr = container_of(node, struct cpu_timer, node); + ctmr->head = NULL; + } } /* @@ -429,9 +435,9 @@ static void cleanup_timers_list(struct list_head *head) */ static void cleanup_timers(struct posix_cputimers *pct) { - cleanup_timers_list(&pct->bases[CPUCLOCK_PROF].cpu_timers); - cleanup_timers_list(&pct->bases[CPUCLOCK_VIRT].cpu_timers); - cleanup_timers_list(&pct->bases[CPUCLOCK_SCHED].cpu_timers); + cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead); + cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead); + cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead); } /* @@ -454,28 +460,18 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) */ static void arm_timer(struct k_itimer *timer) { - struct cpu_timer_list *const nt = &timer->it.cpu; int clkidx = CPUCLOCK_WHICH(timer->it_clock); - struct task_struct *p = timer->it.cpu.task; - u64 newexp = timer->it.cpu.expires; + struct cpu_timer *ctmr = &timer->it.cpu; + u64 newexp = cpu_timer_getexpires(ctmr); + struct task_struct *p = ctmr->task; struct posix_cputimer_base *base; - struct list_head *head, *listpos; - struct cpu_timer_list *next; if (CPUCLOCK_PERTHREAD(timer->it_clock)) base = p->posix_cputimers.bases + clkidx; else base = p->signal->posix_cputimers.bases + clkidx; - listpos = head = &base->cpu_timers; - list_for_each_entry(next,head, entry) { - if (nt->expires < next->expires) - break; - listpos = &next->entry; - } - list_add(&nt->entry, listpos); - - if (listpos != head) + if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; /* @@ -498,24 +494,26 @@ static void arm_timer(struct k_itimer *timer) */ static void cpu_timer_fire(struct k_itimer *timer) { + struct cpu_timer *ctmr = &timer->it.cpu; + if ((timer->it_sigev_notify & 
~SIGEV_THREAD_ID) == SIGEV_NONE) { /* * User don't want any signal. */ - timer->it.cpu.expires = 0; + cpu_timer_setexpires(ctmr, 0); } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. */ wake_up_process(timer->it_process); - timer->it.cpu.expires = 0; + cpu_timer_setexpires(ctmr, 0); } else if (!timer->it_interval) { /* * One-shot timer. Clear it as soon as it's fired. */ posix_timer_event(timer, 0); - timer->it.cpu.expires = 0; + cpu_timer_setexpires(ctmr, 0); } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { /* * The signal did not get queued because the signal @@ -539,10 +537,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); u64 old_expires, new_expires, old_incr, val; - struct task_struct *p = timer->it.cpu.task; + struct cpu_timer *ctmr = &timer->it.cpu; + struct task_struct *p = ctmr->task; struct sighand_struct *sighand; unsigned long flags; - int ret; + int ret = 0; if (WARN_ON_ONCE(!p)) return -EINVAL; @@ -562,22 +561,21 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(sighand == NULL)) { + if (unlikely(sighand == NULL)) return -ESRCH; - } /* * Disarm any old timer after extracting its expiry time. */ - - ret = 0; old_incr = timer->it_interval; - old_expires = timer->it.cpu.expires; + old_expires = cpu_timer_getexpires(ctmr); + if (unlikely(timer->it.cpu.firing)) { timer->it.cpu.firing = -1; ret = TIMER_RETRY; - } else - list_del_init(&timer->it.cpu.entry); + } else { + cpu_timer_dequeue(ctmr); + } /* * We need to sample the current value to convert the new @@ -598,18 +596,16 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, old->it_value.tv_nsec = 0; } else { /* - * Update the timer in case it has - * overrun already. If it has, - * we'll report it as having overrun - * and with the next reloaded timer - * already ticking, though we are - * swallowing that pending - * notification here to install the - * new setting. + * Update the timer in case it has overrun already. + * If it has, we'll report it as having overrun and + * with the next reloaded timer already ticking, + * though we are swallowing that pending + * notification here to install the new setting. */ - bump_cpu_timer(timer, val); - if (val < timer->it.cpu.expires) { - old_expires = timer->it.cpu.expires - val; + u64 exp = bump_cpu_timer(timer, val); + + if (val < exp) { + old_expires = exp - val; old->it_value = ns_to_timespec64(old_expires); } else { old->it_value.tv_nsec = 1; @@ -638,7 +634,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * For a timer with no notification action, we don't actually * arm the timer (we'll just fake it for timer_gettime). 
*/ - timer->it.cpu.expires = new_expires; + cpu_timer_setexpires(ctmr, new_expires); if (new_expires != 0 && val < new_expires) { arm_timer(timer); } @@ -680,8 +676,9 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct task_struct *p = timer->it.cpu.task; - u64 now; + struct cpu_timer *ctmr = &timer->it.cpu; + u64 now, expires = cpu_timer_getexpires(ctmr); + struct task_struct *p = ctmr->task; if (WARN_ON_ONCE(!p)) return; @@ -691,7 +688,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp */ itp->it_interval = ktime_to_timespec64(timer->it_interval); - if (!timer->it.cpu.expires) + if (!expires) return; /* @@ -713,9 +710,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp /* * The process has been reaped. * We can't even collect a sample any more. - * Call the timer disarmed, nothing else to do. + * Disarm the timer, nothing else to do. */ - timer->it.cpu.expires = 0; + cpu_timer_setexpires(ctmr, 0); return; } else { now = cpu_clock_sample_group(clkid, p, false); @@ -723,8 +720,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp } } - if (now < timer->it.cpu.expires) { - itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now); + if (now < expires) { + itp->it_value = ns_to_timespec64(expires - now); } else { /* * The timer should have expired already, but the firing @@ -735,37 +732,41 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp } } -static unsigned long long -check_timers_list(struct list_head *timers, - struct list_head *firing, - unsigned long long curr) -{ - int maxfire = 20; - - while (!list_empty(timers)) { - struct cpu_timer_list *t; - - t = list_first_entry(timers, struct cpu_timer_list, entry); +#define MAX_COLLECTED 20 - if (!--maxfire || curr < t->expires) - return t->expires; - - t->firing = 1; - list_move_tail(&t->entry, firing); +static u64 collect_timerqueue(struct timerqueue_head *head, + struct list_head *firing, u64 now) +{ + struct timerqueue_node *next; + int i = 0; + + while ((next = timerqueue_getnext(head))) { + struct cpu_timer *ctmr; + u64 expires; + + ctmr = container_of(next, struct cpu_timer, node); + expires = cpu_timer_getexpires(ctmr); + /* Limit the number of timers to expire at once */ + if (++i == MAX_COLLECTED || now < expires) + return expires; + + ctmr->firing = 1; + cpu_timer_dequeue(ctmr); + list_add_tail(&ctmr->elist, firing); } return U64_MAX; } -static void collect_posix_cputimers(struct posix_cputimers *pct, - u64 *samples, struct list_head *firing) +static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + struct list_head *firing) { struct posix_cputimer_base *base = pct->bases; int i; for (i = 0; i < CPUCLOCK_MAX; i++, base++) { - base->nextevt = check_timers_list(&base->cpu_timers, firing, - samples[i]); + base->nextevt = collect_timerqueue(&base->tqhead, firing, + samples[i]); } } @@ -948,7 +949,8 @@ static void check_process_timers(struct task_struct *tsk, static void posix_cpu_timer_rearm(struct k_itimer *timer) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct task_struct *p = timer->it.cpu.task; + struct cpu_timer *ctmr = &timer->it.cpu; + struct task_struct *p = ctmr->task; struct sighand_struct *sighand; unsigned long flags; u64 now; @@ -980,7 +982,7 @@ static void posix_cpu_timer_rearm(struct 
k_itimer *timer) * The process has been reaped. * We can't even collect a sample any more. */ - timer->it.cpu.expires = 0; + cpu_timer_setexpires(ctmr, 0); return; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* If the process is dying, no need to rearm */ @@ -1124,11 +1126,11 @@ void run_posix_cpu_timers(void) * each timer's lock before clearing its firing flag, so no * timer call will interfere. */ - list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { + list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { int cpu_firing; spin_lock(&timer->it_lock); - list_del_init(&timer->it.cpu.entry); + list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; timer->it.cpu.firing = 0; /* @@ -1204,6 +1206,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, timer.it_overrun = -1; error = posix_cpu_timer_create(&timer); timer.it_process = current; + if (!error) { static struct itimerspec64 zero_it; struct restart_block *restart; @@ -1219,7 +1222,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } while (!signal_pending(current)) { - if (timer.it.cpu.expires == 0) { + if (!cpu_timer_getexpires(&timer.it.cpu)) { /* * Our timer fired and was reset, below * deletion can not fail. @@ -1241,7 +1244,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * We were interrupted by a signal. */ - expires = timer.it.cpu.expires; + expires = cpu_timer_getexpires(&timer.it.cpu); error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); if (!error) { /* -- cgit v1.2.3 From 101f85b56d03b36418bbf867f67d81710839b0ec Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 28 Aug 2019 16:58:15 +0800 Subject: genirq/affinity: Remove const qualifier from node_to_cpumask argument When CONFIG_CPUMASK_OFFSTACK isn't enabled, 'cpumask_var_t' is as 'typedef struct cpumask cpumask_var_t[1]', so the argument 'node_to_cpumask' alloc_nodes_vectors() can't be declared as 'const cpumask_var_t *' Fixes the following warning: kernel/irq/affinity.c: In function '__irq_build_affinity_masks': alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, ^ kernel/irq/affinity.c:128:13: note: expected 'const struct cpumask (*)[1]' but argument is of type 'struct cpumask (*)[1]' static void alloc_nodes_vectors(unsigned int numvecs, ^ Fixes: b1a5a73e64e9 ("genirq/affinity: Spread vectors on node according to nr_cpu ratio") Reported-by: kbuild test robot Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190828085815.19931-1-ming.lei@redhat.com --- kernel/irq/affinity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index d905e844bf3a..4d89ad4fae3b 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -126,7 +126,7 @@ static int ncpus_cmp_func(const void *l, const void *r) * for each node. */ static void alloc_nodes_vectors(unsigned int numvecs, - const cpumask_var_t *node_to_cpumask, + cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, const nodemask_t nodemsk, struct cpumask *nmsk, -- cgit v1.2.3 From 71fed982d63cb2bb88db6f36059e3b14a7913846 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 23 Aug 2019 13:38:45 +0200 Subject: tick: Mark sched_timer to expire in hard interrupt context sched_timer must be initialized with the _HARD mode suffix to ensure expiry in hard interrupt context on RT. 
The previous conversion to HARD expiry mode missed one instance in tick_nohz_switch_to_nohz(). Fix it up. Fixes: 902a9f9c50905 ("tick: Mark tick related hrtimers to expiry in hard interrupt context") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190823113845.12125-3-bigeasy@linutronix.de --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 01ff32a02af2..955851748dc3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1233,7 +1233,7 @@ static void tick_nohz_switch_to_nohz(void) * Recycle the hrtimer in ts, so we can share the * hrtimer_forward with the highres code. */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); /* Get the next period */ next = tick_init_jiffy_update(); -- cgit v1.2.3 From a2ed4fd685cd23e98922f933d5dbccfbe82a4f08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Aug 2019 12:52:28 +0200 Subject: posix-cpu-timers: Make expiry_active check actually work correctly The state tracking changes broke the expiry active check by not writing to it and instead setting timers_active, which is already set. That's not a big issue as the actual expiry is protected by sighand lock, so concurrent handling is not possible. That means that the second task which invokes that function executes the expiry code for nothing. Write to the proper flag. Also add a check to check_process_timers() for whether the flag is already set. That check had been missing in the code before the rework already. The check for another task handling the expiry of process wide timers was only done in the fastpath check. If the fastpath check returns true because a per task timer expired, then the checking of process wide timers was done in parallel which is, as explained above, just a waste of cycles. Fixes: 244d49e30653 ("posix-cpu-timers: Move state tracking to struct posix_cputimers") Signed-off-by: Thomas Gleixner Cc: Frederic Weisbecker --- kernel/time/posix-cpu-timers.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 73c492ce404b..c3a95b122209 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -884,16 +884,17 @@ static void check_process_timers(struct task_struct *tsk, /* * If there are no active process wide timers (POSIX 1.b, itimers, - * RLIMIT_CPU) nothing to check. + * RLIMIT_CPU) nothing to check. Also skip the process wide timer + * processing when there is already another task handling them. */ - if (!READ_ONCE(pct->timers_active)) + if (!READ_ONCE(pct->timers_active) || pct->expiry_active) return; - /* + /* * Signify that a thread is checking for process timers. * Write access to this field is protected by the sighand lock. */ - pct->timers_active = true; + pct->expiry_active = true; /* * Collect the current process totals.
Group accounting is active -- cgit v1.2.3 From 8f35eaa5f2de020073a48ad51112237c5932cfcc Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Wed, 28 Aug 2019 18:50:05 +0100 Subject: jump_label: Don't warn on __exit jump entries On architectures that discard .exit.* sections at runtime, a warning is printed for each jump label that is used within an in-kernel __exit annotated function: can't patch jump_label at ehci_hcd_cleanup+0x8/0x3c WARNING: CPU: 0 PID: 1 at kernel/jump_label.c:410 __jump_label_update+0x12c/0x138 As these functions will never get executed (they are free'd along with the rest of initmem) - we do not need to patch them and should not display any warnings. The warning is displayed because the test required to satisfy jump_entry_is_init is based on init_section_contains (__init_begin to __init_end) whereas the test in __jump_label_update is based on init_kernel_text (_sinittext to _einittext) via kernel_text_address). Fixes: 19483677684b ("jump_label: Annotate entries that operate on __init code earlier") Signed-off-by: Andrew Murray Acked-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Signed-off-by: Will Deacon --- kernel/jump_label.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index df3008419a1d..cdb3ffab128b 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -407,7 +407,9 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init) return false; if (!kernel_text_address(jump_entry_code(entry))) { - WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); + WARN_ONCE(!jump_entry_is_init(entry), + "can't patch jump_label at %pS", + (void *)jump_entry_code(entry)); return false; } -- cgit v1.2.3 From 419e2f1838819e954071dfa1d1f820ab3386ada1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Aug 2019 09:03:44 +0200 Subject: dma-mapping: remove arch_dma_mmap_pgprot arch_dma_mmap_pgprot is used for two things: 1) to override the "normal" uncached page attributes for mapping memory coherent to devices that can't snoop the CPU caches 2) to provide the special DMA_ATTR_WRITE_COMBINE semantics on older arm systems and some mips platforms Replace one with the pgprot_dmacoherent macro that is already provided by arm and much simpler to use, and lift the DMA_ATTR_WRITE_COMBINE handling to common code with an explicit arch opt-in. Signed-off-by: Christoph Hellwig Acked-by: Geert Uytterhoeven # m68k Acked-by: Paul Burton # mips --- kernel/dma/Kconfig | 12 +++++++++--- kernel/dma/mapping.c | 8 +++++--- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 9decbba255fc..73c5c2b8e824 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -20,6 +20,15 @@ config ARCH_HAS_DMA_COHERENCE_H config ARCH_HAS_DMA_SET_MASK bool +# +# Select this option if the architecture needs special handling for +# DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what +# people thing of when saying write combine, so very few platforms should +# need to enable this. 
+# +config ARCH_HAS_DMA_WRITE_COMBINE + bool + config DMA_DECLARE_COHERENT bool @@ -45,9 +54,6 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_DMA_COHERENT_TO_PFN bool -config ARCH_HAS_DMA_MMAP_PGPROT - bool - config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b0038ca3aa92..1b96616c9f20 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -161,9 +161,11 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && (attrs & DMA_ATTR_NON_CONSISTENT))) return prot; - if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT)) - return arch_dma_mmap_pgprot(dev, prot, attrs); - return pgprot_noncached(prot); +#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE + if (attrs & DMA_ATTR_WRITE_COMBINE) + return pgprot_writecombine(prot); +#endif + return pgprot_dmacoherent(prot); } #endif /* CONFIG_MMU */ -- cgit v1.2.3 From 8e3a68fb55e00e0760bd8023883e064f1f93c62d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Aug 2019 12:42:15 +0300 Subject: dma-mapping: make dma_atomic_pool_init self-contained The memory allocated for the atomic pool needs to have the same mapping attributes that we use for remapping, so use pgprot_dmacoherent instead of open coding it. Also deduct a suitable zone to allocate the memory from based on the presence of the DMA zones. Signed-off-by: Christoph Hellwig --- kernel/dma/remap.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index ffe78f0b2fe4..838123f79639 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -105,7 +105,16 @@ static int __init early_coherent_pool(char *p) } early_param("coherent_pool", early_coherent_pool); -int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) +static gfp_t dma_atomic_pool_gfp(void) +{ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + return GFP_DMA; + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + return GFP_DMA32; + return GFP_KERNEL; +} + +static int __init dma_atomic_pool_init(void) { unsigned int pool_size_order = get_order(atomic_pool_size); unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT; @@ -117,7 +126,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) page = dma_alloc_from_contiguous(NULL, nr_pages, pool_size_order, false); else - page = alloc_pages(gfp, pool_size_order); + page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order); if (!page) goto out; @@ -128,7 +137,8 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot) goto free_page; addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP, - prot, __builtin_return_address(0)); + pgprot_dmacoherent(PAGE_KERNEL), + __builtin_return_address(0)); if (!addr) goto destroy_genpool; @@ -155,6 +165,7 @@ out: atomic_pool_size / 1024); return -ENOMEM; } +postcore_initcall(dma_atomic_pool_init); bool dma_in_atomic_pool(void *start, size_t size) { -- cgit v1.2.3 From 7bd46644ea0f6021dc396a39a8bfd3a58f6f1f9f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 4 Jul 2019 20:04:41 +0530 Subject: ftrace: Fix NULL pointer dereference in t_probe_next() LTP testsuite on powerpc results in the below crash: Unable to handle kernel paging request for data at address 0x00000000 Faulting instruction address: 0xc00000000029d800 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA PowerNV ... 
CPU: 68 PID: 96584 Comm: cat Kdump: loaded Tainted: G W NIP: c00000000029d800 LR: c00000000029dac4 CTR: c0000000001e6ad0 REGS: c0002017fae8ba10 TRAP: 0300 Tainted: G W MSR: 9000000000009033 CR: 28022422 XER: 20040000 CFAR: c00000000029d90c DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 0 ... NIP [c00000000029d800] t_probe_next+0x60/0x180 LR [c00000000029dac4] t_mod_start+0x1a4/0x1f0 Call Trace: [c0002017fae8bc90] [c000000000cdbc40] _cond_resched+0x10/0xb0 (unreliable) [c0002017fae8bce0] [c0000000002a15b0] t_start+0xf0/0x1c0 [c0002017fae8bd30] [c0000000004ec2b4] seq_read+0x184/0x640 [c0002017fae8bdd0] [c0000000004a57bc] sys_read+0x10c/0x300 [c0002017fae8be30] [c00000000000b388] system_call+0x5c/0x70 The test (ftrace_set_ftrace_filter.sh) is part of ftrace stress tests and the crash happens when the test does 'cat $TRACING_PATH/set_ftrace_filter'. The address points to the second line below, in t_probe_next(), where filter_hash is dereferenced: hash = iter->probe->ops.func_hash->filter_hash; size = 1 << hash->size_bits; This happens due to a race with register_ftrace_function_probe(). A new ftrace_func_probe is created and added into the func_probes list in trace_array under ftrace_lock. However, before initializing the filter, we drop ftrace_lock, and re-acquire it after acquiring regex_lock. If another process is trying to read set_ftrace_filter, it will be able to acquire ftrace_lock during this window and it will end up seeing a NULL filter_hash. Fix this by just checking for a NULL filter_hash in t_probe_next(). If the filter_hash is NULL, then this probe is just being added and we can simply return from here. Link: http://lkml.kernel.org/r/05e021f757625cbbb006fad41380323dbe4e3b43.1562249521.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 7b60f3d876156 ("ftrace: Dynamically create the probe ftrace_ops for the trace_array") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca34503f178..80beed2cf0da 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3095,6 +3095,10 @@ t_probe_next(struct seq_file *m, loff_t *pos) hnd = &iter->probe_entry->hlist; hash = iter->probe->ops.func_hash->filter_hash; + + if (!hash) + return NULL; + size = 1 << hash->size_bits; retry: -- cgit v1.2.3 From 372e0d01da71c84dcecf7028598a33813b0d5256 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 30 Aug 2019 16:30:01 -0400 Subject: ftrace: Check for empty hash and comment the race with registering probes The race between adding a function probe and reading the probes that exist is very subtle. It needs a comment. Also, the issue can also happen if the probe has has the EMPTY_HASH as its func_hash. Cc: stable@vger.kernel.org Fixes: 7b60f3d876156 ("ftrace: Dynamically create the probe ftrace_ops for the trace_array") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 80beed2cf0da..6200a6fe10e3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3096,7 +3096,11 @@ t_probe_next(struct seq_file *m, loff_t *pos) hash = iter->probe->ops.func_hash->filter_hash; - if (!hash) + /* + * A probe being registered may temporarily have an empty hash + * and it's at the end of the func_probes list. 
+ */ + if (!hash || hash == EMPTY_HASH) return NULL; size = 1 << hash->size_bits; @@ -4324,6 +4328,10 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, mutex_unlock(&ftrace_lock); + /* + * Note, there's a small window here that the func_hash->filter_hash + * may be NULL or empty. Need to be carefule when reading the loop. + */ mutex_lock(&probe->ops.func_hash->regex_lock); orig_hash = &probe->ops.func_hash->filter_hash; -- cgit v1.2.3 From 5b0022dd32b7c2e15edf1827ba80aa1407edf9ff Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 4 Jul 2019 20:04:42 +0530 Subject: ftrace: Check for successful allocation of hash In register_ftrace_function_probe(), we are not checking the return value of alloc_and_copy_ftrace_hash(). The subsequent call to ftrace_match_records() may end up dereferencing the same. Add a check to ensure this doesn't happen. Link: http://lkml.kernel.org/r/26e92574f25ad23e7cafa3cf5f7a819de1832cbe.1562249521.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 1ec3a81a0cf42 ("ftrace: Have each function probe use its own ftrace_ops") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6200a6fe10e3..f9821a3374e9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4338,6 +4338,11 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, old_hash = *orig_hash; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + if (!hash) { + ret = -ENOMEM; + goto out; + } + ret = ftrace_match_records(hash, glob, strlen(glob)); /* Nothing found? */ -- cgit v1.2.3 From 595a438c78dbdc43d6c9db4f437267f0bd1548bf Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Thu, 4 Jul 2019 20:21:10 +0300 Subject: tracing: Make exported ftrace_set_clr_event non-static The function ftrace_set_clr_event is declared static and marked EXPORT_SYMBOL_GPL(), which is at best an odd combination. Because the function was decided to be a part of API, this commit removes the static attribute and adds the declaration to the header. 
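A minimal sketch of the intended in-kernel usage (hypothetical caller; it assumes only the signature visible in the hunk below and that the caller already owns a struct trace_array from the instance API):

        /* Enable a single event in a private trace instance. */
        static int example_enable_sched_switch(struct trace_array *tr)
        {
                char buf[] = "sched:sched_switch";      /* mutable, the API takes char * */

                return ftrace_set_clr_event(tr, buf, 1);
        }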
Link: http://lkml.kernel.org/r/20190704172110.27041-1-efremov@linux.com Fixes: f45d1225adb04 ("tracing: Kernel access to Ftrace instances") Reviewed-by: Joe Jin Signed-off-by: Denis Efremov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c7506bc81b75..648930823b57 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -787,7 +787,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, return ret; } -static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) +int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; int ret; -- cgit v1.2.3 From 19a58ce1dc72264b9d50ff6d86cc36b3c439fb64 Mon Sep 17 00:00:00 2001 From: Xinpeng Liu Date: Thu, 8 Aug 2019 07:29:23 +0800 Subject: tracing/probe: Fix null pointer dereference BUG: KASAN: null-ptr-deref in trace_probe_cleanup+0x8d/0xd0 Read of size 8 at addr 0000000000000000 by task syz-executor.0/9746 trace_probe_cleanup+0x8d/0xd0 free_trace_kprobe.part.14+0x15/0x50 alloc_trace_kprobe+0x23e/0x250 Link: http://lkml.kernel.org/r/1565220563-980-1-git-send-email-danielliu861@gmail.com Fixes: e3dc9f898ef9c ("tracing/probe: Add trace_event_call accesses APIs") Signed-off-by: Xinpeng Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dbef0d135075..fb6bfbc5bf86 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -895,7 +895,8 @@ void trace_probe_cleanup(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); - kfree(call->class->system); + if (call->class) + kfree(call->class->system); kfree(call->name); kfree(call->print_fmt); } -- cgit v1.2.3 From c68c9ec1c52e5bcd221eb09bc5344ad4f407b204 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Aug 2019 22:25:47 -0700 Subject: tracing: Correct kdoc formats Fix the following kdoc warnings: kernel/trace/trace.c:1579: warning: Function parameter or member 'tr' not described in 'update_max_tr_single' kernel/trace/trace.c:1579: warning: Function parameter or member 'tsk' not described in 'update_max_tr_single' kernel/trace/trace.c:1579: warning: Function parameter or member 'cpu' not described in 'update_max_tr_single' kernel/trace/trace.c:1776: warning: Function parameter or member 'type' not described in 'register_tracer' kernel/trace/trace.c:2239: warning: Function parameter or member 'task' not described in 'tracing_record_taskinfo' kernel/trace/trace.c:2239: warning: Function parameter or member 'flags' not described in 'tracing_record_taskinfo' kernel/trace/trace.c:2269: warning: Function parameter or member 'prev' not described in 'tracing_record_taskinfo_sched_switch' kernel/trace/trace.c:2269: warning: Function parameter or member 'next' not described in 'tracing_record_taskinfo_sched_switch' kernel/trace/trace.c:2269: warning: Function parameter or member 'flags' not described in 'tracing_record_taskinfo_sched_switch' kernel/trace/trace.c:3078: warning: Function parameter or member 'ip' not described in 'trace_vbprintk' kernel/trace/trace.c:3078: warning: Function parameter or member 'fmt' not described in 'trace_vbprintk' kernel/trace/trace.c:3078: warning: Function 
parameter or member 'args' not described in 'trace_vbprintk' Link: http://lkml.kernel.org/r/20190828052549.2472-2-jakub.kicinski@netronome.com Signed-off-by: Jakub Kicinski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 525a97fbbc60..563e80f9006a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1567,9 +1567,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, /** * update_max_tr_single - only copy one trace over, and reset the rest - * @tr - tracer - * @tsk - task with the latency - * @cpu - the cpu of the buffer to copy. + * @tr: tracer + * @tsk: task with the latency + * @cpu: the cpu of the buffer to copy. * * Flip the trace of a single CPU buffer between the @tr and the max_tr. */ @@ -1767,7 +1767,7 @@ static void __init apply_trace_boot_options(void); /** * register_tracer - register a tracer with the ftrace system. - * @type - the plugin for the tracer + * @type: the plugin for the tracer * * Register a new plugin tracer. */ @@ -2230,9 +2230,9 @@ static bool tracing_record_taskinfo_skip(int flags) /** * tracing_record_taskinfo - record the task info of a task * - * @task - task to record - * @flags - TRACE_RECORD_CMDLINE for recording comm - * - TRACE_RECORD_TGID for recording tgid + * @task: task to record + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo(struct task_struct *task, int flags) { @@ -2258,10 +2258,10 @@ void tracing_record_taskinfo(struct task_struct *task, int flags) /** * tracing_record_taskinfo_sched_switch - record task info for sched_switch * - * @prev - previous task during sched_switch - * @next - next task during sched_switch - * @flags - TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid + * @prev: previous task during sched_switch + * @next: next task during sched_switch + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) @@ -3072,7 +3072,9 @@ static void trace_printk_start_stop_comm(int enabled) /** * trace_vbprintk - write binary msg to tracing buffer - * + * @ip: The address of the caller + * @fmt: The string format to write to the buffer + * @args: Arguments for @fmt */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { -- cgit v1.2.3 From 0bc11ed5ab60c135aa764a62c02cd5ea68289de4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Jul 2019 15:24:37 +0900 Subject: kprobes: Allow kprobes coexist with livepatch Allow kprobes which do not modify regs->ip, coexist with livepatch by dropping FTRACE_OPS_FL_IPMODIFY from ftrace_ops. User who wants to modify regs->ip (e.g. function fault injection) must set a dummy post_handler to its kprobes when registering. However, if such regs->ip modifying kprobes is set on a function, that function can not be livepatched. 
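A minimal sketch of such a registration (the probed symbol and handler names are illustrative, and regs->ip is the x86 spelling of the instruction pointer):

  /* hypothetical redirect target, resolved elsewhere */
  extern void my_replacement(void);

  /* pre_handler that rewrites the instruction pointer */
  static int my_pre(struct kprobe *p, struct pt_regs *regs)
  {
          regs->ip = (unsigned long)my_replacement;
          return 1;
  }

  /* dummy post_handler: its mere presence makes kprobes pick the IPMODIFY ops */
  static void my_post(struct kprobe *p, struct pt_regs *regs, unsigned long flags)
  {
  }

  static struct kprobe my_kp = {
          .symbol_name  = "do_sys_open",   /* illustrative probe point */
          .pre_handler  = my_pre,
          .post_handler = my_post,
  };

which is then registered with register_kprobe(&my_kp) as usual.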
Link: http://lkml.kernel.org/r/156403587671.30117.5233558741694155985.stgit@devnote2 Acked-by: Joe Lawrence Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 56 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d9770a5393c8..f57deec96ba1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -961,9 +961,16 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) #ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { + .func = kprobe_ftrace_handler, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; + +static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY, }; + +static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; /* Must ensure p->addr is really on ftrace */ @@ -976,58 +983,75 @@ static int prepare_kprobe(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static int arm_kprobe_ftrace(struct kprobe *p) +static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, + int *cnt) { int ret = 0; - ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, - (unsigned long)p->addr, 0, 0); + ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0); if (ret) { pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", p->addr, ret); return ret; } - if (kprobe_ftrace_enabled == 0) { - ret = register_ftrace_function(&kprobe_ftrace_ops); + if (*cnt == 0) { + ret = register_ftrace_function(ops); if (ret) { pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); goto err_ftrace; } } - kprobe_ftrace_enabled++; + (*cnt)++; return ret; err_ftrace: /* - * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a - * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental - * empty filter_hash which would undesirably trace all functions. + * At this point, sinec ops is not registered, we should be sefe from + * registering empty filter. */ - ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); + ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); return ret; } +static int arm_kprobe_ftrace(struct kprobe *p) +{ + bool ipmodify = (p->post_handler != NULL); + + return __arm_kprobe_ftrace(p, + ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, + ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); +} + /* Caller must lock kprobe_mutex */ -static int disarm_kprobe_ftrace(struct kprobe *p) +static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, + int *cnt) { int ret = 0; - if (kprobe_ftrace_enabled == 1) { - ret = unregister_ftrace_function(&kprobe_ftrace_ops); + if (*cnt == 1) { + ret = unregister_ftrace_function(ops); if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) return ret; } - kprobe_ftrace_enabled--; + (*cnt)--; - ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, - (unsigned long)p->addr, 1, 0); + ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", p->addr, ret); return ret; } + +static int disarm_kprobe_ftrace(struct kprobe *p) +{ + bool ipmodify = (p->post_handler != NULL); + + return __disarm_kprobe_ftrace(p, + ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, + ipmodify ? 
&kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); +} #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) #define arm_kprobe_ftrace(p) (-ENODEV) -- cgit v1.2.3 From 60d53e2c3b75e79c83970fe73db79123d9462c7c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:07:20 +0900 Subject: tracing/probe: Split trace_event related data from trace_probe Split the trace_event related data from trace_probe data structure and introduce trace_probe_event data structure for its folder. This trace_probe_event data structure can have multiple trace_probe. Link: http://lkml.kernel.org/r/156095683995.28024.7552150340561557873.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 157 ++++++++++++++++++++++++++++++----------- kernel/trace/trace_probe.c | 54 ++++++++++----- kernel/trace/trace_probe.h | 48 ++++++++++--- kernel/trace/trace_uprobe.c | 165 +++++++++++++++++++++++++++++++++----------- 4 files changed, 311 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9d483ad9bb6c..eac6344a2e7c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -180,20 +180,33 @@ unsigned long trace_kprobe_address(struct trace_kprobe *tk) return addr; } +static nokprobe_inline struct trace_kprobe * +trace_kprobe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return NULL; + + return container_of(tp, struct trace_kprobe, tp); +} + bool trace_kprobe_on_func_entry(struct trace_event_call *call) { - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); - return kprobe_on_func_entry(tk->rp.kp.addr, + return tk ? kprobe_on_func_entry(tk->rp.kp.addr, tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, - tk->rp.kp.addr ? 0 : tk->rp.kp.offset); + tk->rp.kp.addr ? 0 : tk->rp.kp.offset) : false; } bool trace_kprobe_error_injectable(struct trace_event_call *call) { - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); - return within_error_injection_list(trace_kprobe_address(tk)); + return tk ? within_error_injection_list(trace_kprobe_address(tk)) : + false; } static int register_kprobe_event(struct trace_kprobe *tk); @@ -291,32 +304,75 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) return ret; } +static void __disable_trace_kprobe(struct trace_probe *tp) +{ + struct trace_probe *pos; + struct trace_kprobe *tk; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tk = container_of(pos, struct trace_kprobe, tp); + if (!trace_kprobe_is_registered(tk)) + continue; + if (trace_kprobe_is_return(tk)) + disable_kretprobe(&tk->rp); + else + disable_kprobe(&tk->rp.kp); + } +} + /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. 
*/ -static int -enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) +static int enable_trace_kprobe(struct trace_event_call *call, + struct trace_event_file *file) { - bool enabled = trace_probe_is_enabled(&tk->tp); + struct trace_probe *pos, *tp; + struct trace_kprobe *tk; + bool enabled; int ret = 0; + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ if (file) { - ret = trace_probe_add_file(&tk->tp, file); + ret = trace_probe_add_file(tp, file); if (ret) return ret; } else - trace_probe_set_flag(&tk->tp, TP_FLAG_PROFILE); + trace_probe_set_flag(tp, TP_FLAG_PROFILE); if (enabled) return 0; - ret = __enable_trace_kprobe(tk); - if (ret) { + enabled = false; + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tk = container_of(pos, struct trace_kprobe, tp); + if (trace_kprobe_has_gone(tk)) + continue; + ret = __enable_trace_kprobe(tk); + if (ret) { + if (enabled) { + __disable_trace_kprobe(tp); + enabled = false; + } + break; + } + enabled = true; + } + + if (!enabled) { + /* No probe is enabled. Roll back */ if (file) - trace_probe_remove_file(&tk->tp, file); + trace_probe_remove_file(tp, file); else - trace_probe_clear_flag(&tk->tp, TP_FLAG_PROFILE); + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + if (!ret) + /* Since all probes are gone, this is not available */ + ret = -EADDRNOTAVAIL; } return ret; @@ -326,11 +382,14 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) * Disable trace_probe * if the file is NULL, disable "perf" handler, or disable "trace" handler. */ -static int -disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) +static int disable_trace_kprobe(struct trace_event_call *call, + struct trace_event_file *file) { - struct trace_probe *tp = &tk->tp; - int ret = 0; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; if (file) { if (!trace_probe_get_file_link(tp, file)) @@ -341,12 +400,8 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) } else trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - if (!trace_probe_is_enabled(tp) && trace_kprobe_is_registered(tk)) { - if (trace_kprobe_is_return(tk)) - disable_kretprobe(&tk->rp); - else - disable_kprobe(&tk->rp.kp); - } + if (!trace_probe_is_enabled(tp)) + __disable_trace_kprobe(tp); out: if (file) @@ -358,7 +413,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) */ trace_probe_remove_file(tp, file); - return ret; + return 0; } #if defined(CONFIG_KPROBES_ON_FTRACE) && \ @@ -1089,7 +1144,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags, struct trace_probe *tp; field = (struct kprobe_trace_entry_head *)iter->ent; - tp = container_of(event, struct trace_probe, call.event); + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; trace_seq_printf(s, "%s: (", trace_probe_name(tp)); @@ -1116,7 +1174,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, struct trace_probe *tp; field = (struct kretprobe_trace_entry_head *)iter->ent; - tp = container_of(event, struct trace_probe, call.event); + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; trace_seq_printf(s, "%s: (", trace_probe_name(tp)); @@ -1145,23 +1206,31 @@ static int 
kprobe_event_define_fields(struct trace_event_call *event_call) { int ret; struct kprobe_trace_entry_head field; - struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); } static int kretprobe_event_define_fields(struct trace_event_call *event_call) { int ret; struct kretprobe_trace_entry_head field; - struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); } #ifdef CONFIG_PERF_EVENTS @@ -1289,20 +1358,19 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, static int kprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { - struct trace_kprobe *tk = (struct trace_kprobe *)event->data; struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return enable_trace_kprobe(tk, file); + return enable_trace_kprobe(event, file); case TRACE_REG_UNREGISTER: - return disable_trace_kprobe(tk, file); + return disable_trace_kprobe(event, file); #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return enable_trace_kprobe(tk, NULL); + return enable_trace_kprobe(event, NULL); case TRACE_REG_PERF_UNREGISTER: - return disable_trace_kprobe(tk, NULL); + return disable_trace_kprobe(event, NULL); case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: @@ -1369,7 +1437,6 @@ static inline void init_trace_event_call(struct trace_kprobe *tk) call->flags = TRACE_EVENT_FL_KPROBE; call->class->reg = kprobe_register; - call->data = tk; } static int register_kprobe_event(struct trace_kprobe *tk) @@ -1432,7 +1499,9 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) { struct trace_kprobe *tk; - tk = container_of(event_call, struct trace_kprobe, tp.call); + tk = trace_kprobe_primary_from_call(event_call); + if (unlikely(!tk)) + return; if (trace_probe_is_enabled(&tk->tp)) { WARN_ON(1); @@ -1577,7 +1646,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_kprobe(tk, file); + enable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } } @@ -1598,7 +1668,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_kprobe(tk, file); + enable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } } @@ -1631,7 +1702,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_kprobe(tk, file); + disable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); @@ -1649,7 +1721,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_kprobe(tk, file); + disable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } ret = 
trace_run_command("-:testprobe", create_or_delete_trace_kprobe); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index fb6bfbc5bf86..28733bd6b607 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -889,41 +889,59 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call, void trace_probe_cleanup(struct trace_probe *tp) { - struct trace_event_call *call = trace_probe_event_call(tp); int i; for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); - if (call->class) - kfree(call->class->system); - kfree(call->name); - kfree(call->print_fmt); + if (tp->event) { + struct trace_event_call *call = trace_probe_event_call(tp); + + kfree(tp->event->class.system); + kfree(call->name); + kfree(call->print_fmt); + kfree(tp->event); + tp->event = NULL; + } } int trace_probe_init(struct trace_probe *tp, const char *event, const char *group) { - struct trace_event_call *call = trace_probe_event_call(tp); + struct trace_event_call *call; + int ret = 0; if (!event || !group) return -EINVAL; - call->class = &tp->class; - call->name = kstrdup(event, GFP_KERNEL); - if (!call->name) + tp->event = kzalloc(sizeof(struct trace_probe_event), GFP_KERNEL); + if (!tp->event) return -ENOMEM; - tp->class.system = kstrdup(group, GFP_KERNEL); - if (!tp->class.system) { - kfree(call->name); - call->name = NULL; - return -ENOMEM; + call = trace_probe_event_call(tp); + call->class = &tp->event->class; + call->name = kstrdup(event, GFP_KERNEL); + if (!call->name) { + ret = -ENOMEM; + goto error; + } + + tp->event->class.system = kstrdup(group, GFP_KERNEL); + if (!tp->event->class.system) { + ret = -ENOMEM; + goto error; } - INIT_LIST_HEAD(&tp->files); - INIT_LIST_HEAD(&tp->class.fields); + INIT_LIST_HEAD(&tp->event->files); + INIT_LIST_HEAD(&tp->event->class.fields); + INIT_LIST_HEAD(&tp->event->probes); + INIT_LIST_HEAD(&tp->list); + list_add(&tp->event->probes, &tp->list); return 0; + +error: + trace_probe_cleanup(tp); + return ret; } int trace_probe_register_event_call(struct trace_probe *tp) @@ -952,7 +970,7 @@ int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file) link->file = file; INIT_LIST_HEAD(&link->list); - list_add_tail_rcu(&link->list, &tp->files); + list_add_tail_rcu(&link->list, &tp->event->files); trace_probe_set_flag(tp, TP_FLAG_TRACE); return 0; } @@ -983,7 +1001,7 @@ int trace_probe_remove_file(struct trace_probe *tp, synchronize_rcu(); kfree(link); - if (list_empty(&tp->files)) + if (list_empty(&tp->event->files)) trace_probe_clear_flag(tp, TP_FLAG_TRACE); return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index d1714820efe1..0b84abb884c2 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -222,11 +222,18 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; -struct trace_probe { +/* Event call and class holder */ +struct trace_probe_event { unsigned int flags; /* For TP_FLAG_* */ struct trace_event_class class; struct trace_event_call call; struct list_head files; + struct list_head probes; +}; + +struct trace_probe { + struct list_head list; + struct trace_probe_event *event; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; @@ -240,19 +247,19 @@ struct event_file_link { static inline bool trace_probe_test_flag(struct trace_probe *tp, unsigned int flag) { - return !!(tp->flags & flag); + return !!(tp->event->flags & flag); } static inline void trace_probe_set_flag(struct trace_probe *tp, 
unsigned int flag) { - tp->flags |= flag; + tp->event->flags |= flag; } static inline void trace_probe_clear_flag(struct trace_probe *tp, unsigned int flag) { - tp->flags &= ~flag; + tp->event->flags &= ~flag; } static inline bool trace_probe_is_enabled(struct trace_probe *tp) @@ -262,29 +269,48 @@ static inline bool trace_probe_is_enabled(struct trace_probe *tp) static inline const char *trace_probe_name(struct trace_probe *tp) { - return trace_event_name(&tp->call); + return trace_event_name(&tp->event->call); } static inline const char *trace_probe_group_name(struct trace_probe *tp) { - return tp->call.class->system; + return tp->event->call.class->system; } static inline struct trace_event_call * trace_probe_event_call(struct trace_probe *tp) { - return &tp->call; + return &tp->event->call; +} + +static inline struct trace_probe_event * +trace_probe_event_from_call(struct trace_event_call *event_call) +{ + return container_of(event_call, struct trace_probe_event, call); +} + +static inline struct trace_probe * +trace_probe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe_event *tpe = trace_probe_event_from_call(call); + + return list_first_entry(&tpe->probes, struct trace_probe, list); +} + +static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp) +{ + return &tp->event->probes; } static inline int trace_probe_unregister_event_call(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ - return trace_remove_event_call(&tp->call); + return trace_remove_event_call(&tp->event->call); } static inline bool trace_probe_has_single_file(struct trace_probe *tp) { - return !!list_is_singular(&tp->files); + return !!list_is_singular(&tp->event->files); } int trace_probe_init(struct trace_probe *tp, const char *event, @@ -298,9 +324,9 @@ struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, struct trace_event_file *file); #define trace_probe_for_each_link(pos, tp) \ - list_for_each_entry(pos, &(tp)->files, list) + list_for_each_entry(pos, &(tp)->event->files, list) #define trace_probe_for_each_link_rcu(pos, tp) \ - list_for_each_entry_rcu(pos, &(tp)->files, list) + list_for_each_entry_rcu(pos, &(tp)->event->files, list) /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1ceedb9146b1..ac799abb7da9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -293,6 +293,18 @@ static bool trace_uprobe_match(const char *system, const char *event, (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0); } +static nokprobe_inline struct trace_uprobe * +trace_uprobe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return NULL; + + return container_of(tp, struct trace_uprobe, tp); +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). 
*/ @@ -897,7 +909,10 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e u8 *data; entry = (struct uprobe_trace_entry_head *)iter->ent; - tu = container_of(event, struct trace_uprobe, tp.call.event); + tu = trace_uprobe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (unlikely(!tu)) + goto out; if (is_ret_probe(tu)) { trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", @@ -924,27 +939,71 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, enum uprobe_filter_ctx ctx, struct mm_struct *mm); -static int -probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, - filter_func_t filter) +static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { - bool enabled = trace_probe_is_enabled(&tu->tp); int ret; + tu->consumer.filter = filter; + tu->inode = d_real_inode(tu->path.dentry); + + if (tu->ref_ctr_offset) + ret = uprobe_register_refctr(tu->inode, tu->offset, + tu->ref_ctr_offset, &tu->consumer); + else + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + + if (ret) + tu->inode = NULL; + + return ret; +} + +static void __probe_event_disable(struct trace_probe *tp) +{ + struct trace_probe *pos; + struct trace_uprobe *tu; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); + if (!tu->inode) + continue; + + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); + tu->inode = NULL; + } +} + +static int probe_event_enable(struct trace_event_call *call, + struct trace_event_file *file, filter_func_t filter) +{ + struct trace_probe *pos, *tp; + struct trace_uprobe *tu; + bool enabled; + int ret; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This may also change "enabled" state */ if (file) { - if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + if (trace_probe_test_flag(tp, TP_FLAG_PROFILE)) return -EINTR; - ret = trace_probe_add_file(&tu->tp, file); + ret = trace_probe_add_file(tp, file); if (ret < 0) return ret; } else { - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + if (trace_probe_test_flag(tp, TP_FLAG_TRACE)) return -EINTR; - trace_probe_set_flag(&tu->tp, TP_FLAG_PROFILE); + trace_probe_set_flag(tp, TP_FLAG_PROFILE); } + tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(&tu->filter)); if (enabled) @@ -954,18 +1013,15 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, if (ret) goto err_flags; - tu->consumer.filter = filter; - tu->inode = d_real_inode(tu->path.dentry); - if (tu->ref_ctr_offset) { - ret = uprobe_register_refctr(tu->inode, tu->offset, - tu->ref_ctr_offset, &tu->consumer); - } else { - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); + ret = trace_uprobe_enable(tu, filter); + if (ret) { + __probe_event_disable(tp); + goto err_buffer; + } } - if (ret) - goto err_buffer; - return 0; err_buffer: @@ -973,33 +1029,35 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, err_flags: if (file) - trace_probe_remove_file(&tu->tp, file); + trace_probe_remove_file(tp, file); else - trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); return ret; } -static void -probe_event_disable(struct trace_uprobe *tu, struct 
trace_event_file *file) +static void probe_event_disable(struct trace_event_call *call, + struct trace_event_file *file) { - if (!trace_probe_is_enabled(&tu->tp)) + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return; + + if (!trace_probe_is_enabled(tp)) return; if (file) { - if (trace_probe_remove_file(&tu->tp, file) < 0) + if (trace_probe_remove_file(tp, file) < 0) return; - if (trace_probe_is_enabled(&tu->tp)) + if (trace_probe_is_enabled(tp)) return; } else - trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); - - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->inode = NULL; + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + __probe_event_disable(tp); uprobe_buffer_disable(); } @@ -1007,7 +1065,11 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call) { int ret, size; struct uprobe_trace_entry_head field; - struct trace_uprobe *tu = event_call->data; + struct trace_uprobe *tu; + + tu = trace_uprobe_primary_from_call(event_call); + if (unlikely(!tu)) + return -ENODEV; if (is_ret_probe(tu)) { DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); @@ -1100,6 +1162,27 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) return err; } +static int uprobe_perf_multi_call(struct trace_event_call *call, + struct perf_event *event, + int (*op)(struct trace_uprobe *tu, struct perf_event *event)) +{ + struct trace_probe *pos, *tp; + struct trace_uprobe *tu; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); + ret = op(tu, event); + if (ret) + break; + } + + return ret; +} static bool uprobe_perf_filter(struct uprobe_consumer *uc, enum uprobe_filter_ctx ctx, struct mm_struct *mm) { @@ -1213,30 +1296,29 @@ static int trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { - struct trace_uprobe *tu = event->data; struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, file, NULL); + return probe_event_enable(event, file, NULL); case TRACE_REG_UNREGISTER: - probe_event_disable(tu, file); + probe_event_disable(event, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, NULL, uprobe_perf_filter); + return probe_event_enable(event, NULL, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: - probe_event_disable(tu, NULL); + probe_event_disable(event, NULL); return 0; case TRACE_REG_PERF_OPEN: - return uprobe_perf_open(tu, data); + return uprobe_perf_multi_call(event, data, uprobe_perf_open); case TRACE_REG_PERF_CLOSE: - return uprobe_perf_close(tu, data); + return uprobe_perf_multi_call(event, data, uprobe_perf_close); #endif default: @@ -1330,7 +1412,6 @@ static inline void init_trace_event_call(struct trace_uprobe *tu) call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; call->class->reg = trace_uprobe_register; - call->data = tu; } static int register_uprobe_event(struct trace_uprobe *tu) @@ -1399,7 +1480,7 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) { struct trace_uprobe *tu; - tu = container_of(event_call, struct trace_uprobe, tp.call); + tu = trace_uprobe_primary_from_call(event_call); free_trace_uprobe(tu); } -- cgit v1.2.3 From cb8e7a8d55e052fdcfd1a567305a9a180fb61c57 Mon Sep 17 00:00:00 
2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:07:29 +0900 Subject: tracing/dynevent: Delete all matched events When user gives an event name to delete, delete all matched events instead of the first one. This means if there are several events which have same name but different group (subsystem) name, those are removed if user passed only the event name, e.g. # cat kprobe_events p:group1/testevent _do_fork p:group2/testevent fork_idle # echo -:testevent >> kprobe_events # cat kprobe_events # Link: http://lkml.kernel.org/r/156095684958.28024.16597826267117453638.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index fa100ed3b4de..1cc55c50c491 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -61,10 +61,12 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) for_each_dyn_event_safe(pos, n) { if (type && type != pos->ops) continue; - if (pos->ops->match(system, event, pos)) { - ret = pos->ops->free(pos); + if (!pos->ops->match(system, event, pos)) + continue; + + ret = pos->ops->free(pos); + if (ret) break; - } } mutex_unlock(&event_mutex); -- cgit v1.2.3 From 30199137c899d7e416a2adc58bf09bec217ce9ca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:07:39 +0900 Subject: tracing/dynevent: Pass extra arguments to match operation Pass extra arguments to match operation for checking exact match. If the event doesn't support exact match, it will be ignored. Link: http://lkml.kernel.org/r/156095685930.28024.10405547027475590975.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 4 +++- kernel/trace/trace_dynevent.h | 7 ++++--- kernel/trace/trace_events_hist.c | 4 ++-- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_uprobe.c | 4 ++-- 5 files changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 1cc55c50c491..a41fed46c285 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -47,6 +47,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) return -EINVAL; event++; } + argc--; argv++; p = strchr(event, '/'); if (p) { @@ -61,7 +62,8 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) for_each_dyn_event_safe(pos, n) { if (type && type != pos->ops) continue; - if (!pos->ops->match(system, event, pos)) + if (!pos->ops->match(system, event, + argc, (const char **)argv, pos)) continue; ret = pos->ops->free(pos); diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 8c334064e4d6..46898138d2df 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -31,8 +31,9 @@ struct dyn_event; * @is_busy: Check whether given event is busy so that it can not be deleted. * Return true if it is busy, otherwides false. * @free: Delete the given event. Return 0 if success, otherwides error. - * @match: Check whether given event and system name match this event. - * Return true if it matches, otherwides false. + * @match: Check whether given event and system name match this event. The argc + * and argv is used for exact match. Return true if it matches, otherwides + * false. 
* * Except for @create, these methods are called under holding event_mutex. */ @@ -43,7 +44,7 @@ struct dyn_event_operations { bool (*is_busy)(struct dyn_event *ev); int (*free)(struct dyn_event *ev); bool (*match)(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); }; /* Register new dyn_event type -- must be called at first */ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ca6b0dff60c5..65e7d071ed28 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -374,7 +374,7 @@ static int synth_event_show(struct seq_file *m, struct dyn_event *ev); static int synth_event_release(struct dyn_event *ev); static bool synth_event_is_busy(struct dyn_event *ev); static bool synth_event_match(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations synth_event_ops = { .create = synth_event_create, @@ -422,7 +422,7 @@ static bool synth_event_is_busy(struct dyn_event *ev) } static bool synth_event_match(const char *system, const char *event, - struct dyn_event *ev) + int argc, const char **argv, struct dyn_event *ev) { struct synth_event *sev = to_synth_event(ev); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index eac6344a2e7c..e8f72431b866 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -39,7 +39,7 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_kprobe_release(struct dyn_event *ev); static bool trace_kprobe_is_busy(struct dyn_event *ev); static bool trace_kprobe_match(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations trace_kprobe_ops = { .create = trace_kprobe_create, @@ -138,7 +138,7 @@ static bool trace_kprobe_is_busy(struct dyn_event *ev) } static bool trace_kprobe_match(const char *system, const char *event, - struct dyn_event *ev) + int argc, const char **argv, struct dyn_event *ev) { struct trace_kprobe *tk = to_trace_kprobe(ev); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ac799abb7da9..2862e6829e48 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -44,7 +44,7 @@ static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_uprobe_release(struct dyn_event *ev); static bool trace_uprobe_is_busy(struct dyn_event *ev); static bool trace_uprobe_match(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations trace_uprobe_ops = { .create = trace_uprobe_create, @@ -285,7 +285,7 @@ static bool trace_uprobe_is_busy(struct dyn_event *ev) } static bool trace_uprobe_match(const char *system, const char *event, - struct dyn_event *ev) + int argc, const char **argv, struct dyn_event *ev) { struct trace_uprobe *tu = to_trace_uprobe(ev); -- cgit v1.2.3 From ca89bc071d5e4e981dcc52e0ca90f4500d332e42 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:07:49 +0900 Subject: tracing/kprobe: Add multi-probe per event support Add multi-probe per one event support to kprobe events. User can define several different probes on one trace event if those events have same "event signature", e.g. 
# echo p:testevent _do_fork > kprobe_events # echo p:testevent fork_idle >> kprobe_events # kprobe_events p:kprobes/testevent _do_fork p:kprobes/testevent fork_idle The event signature is defined by kprobe type (retprobe or not), the number of args, argument names, and argument types. Note that this only support appending method. Delete event operation will delete all probes on the event. Link: http://lkml.kernel.org/r/156095686913.28024.9357292202316540742.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 +-- kernel/trace/trace_kprobe.c | 52 ++++++++++++++++++++++++++++++++++----- kernel/trace/trace_probe.c | 59 ++++++++++++++++++++++++++++++++++++++------- kernel/trace/trace_probe.h | 14 ++++++++++- 4 files changed, 111 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 563e80f9006a..a8505d84b76e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4815,11 +4815,11 @@ static const char readme_msg[] = #endif #endif /* CONFIG_STACK_TRACER */ #ifdef CONFIG_DYNAMIC_EVENTS - " dynamic_events\t\t- Add/remove/show the generic dynamic events\n" + " dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_KPROBE_EVENTS - " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" + " kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_UPROBE_EVENTS diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index e8f72431b866..f43098bf62dd 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -492,6 +492,10 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk) /* Unregister a trace_probe and probe_event */ static int unregister_trace_kprobe(struct trace_kprobe *tk) { + /* If other probes are on the event, just unregister kprobe */ + if (trace_probe_has_sibling(&tk->tp)) + goto unreg; + /* Enabled event can not be unregistered */ if (trace_probe_is_enabled(&tk->tp)) return -EBUSY; @@ -500,12 +504,38 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk) if (unregister_kprobe_event(tk)) return -EBUSY; +unreg: __unregister_trace_kprobe(tk); dyn_event_remove(&tk->devent); + trace_probe_unlink(&tk->tp); return 0; } +static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to) +{ + int ret; + + /* Append to existing event */ + ret = trace_probe_append(&tk->tp, &to->tp); + if (ret) + return ret; + + /* Register k*probe */ + ret = __register_trace_kprobe(tk); + if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) { + pr_warn("This probe might be able to register after target module is loaded. 
Continue.\n"); + ret = 0; + } + + if (ret) + trace_probe_unlink(&tk->tp); + else + dyn_event_add(&tk->devent); + + return ret; +} + /* Register a trace_probe and probe_event */ static int register_trace_kprobe(struct trace_kprobe *tk) { @@ -514,14 +544,24 @@ static int register_trace_kprobe(struct trace_kprobe *tk) mutex_lock(&event_mutex); - /* Delete old (same name) event if exist */ old_tk = find_trace_kprobe(trace_probe_name(&tk->tp), trace_probe_group_name(&tk->tp)); if (old_tk) { - ret = unregister_trace_kprobe(old_tk); - if (ret < 0) - goto end; - free_trace_kprobe(old_tk); + if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + ret = -EEXIST; + } else { + ret = trace_probe_compare_arg_type(&tk->tp, &old_tk->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + ret = -EEXIST; + } else + ret = append_trace_kprobe(tk, old_tk); + } + goto end; } /* Register new event */ @@ -755,7 +795,7 @@ static int trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_INSN_BNDRY); else if (ret == -ENOENT) trace_probe_log_err(0, BAD_PROBE_ADDR); - else if (ret != -ENOMEM) + else if (ret != -ENOMEM && ret != -EEXIST) trace_probe_log_err(0, FAIL_REG_PROBE); goto error; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 28733bd6b607..651a1449acde 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -886,6 +886,35 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call, return 0; } +static void trace_probe_event_free(struct trace_probe_event *tpe) +{ + kfree(tpe->class.system); + kfree(tpe->call.name); + kfree(tpe->call.print_fmt); + kfree(tpe); +} + +int trace_probe_append(struct trace_probe *tp, struct trace_probe *to) +{ + if (trace_probe_has_sibling(tp)) + return -EBUSY; + + list_del_init(&tp->list); + trace_probe_event_free(tp->event); + + tp->event = to->event; + list_add_tail(&tp->list, trace_probe_probe_list(to)); + + return 0; +} + +void trace_probe_unlink(struct trace_probe *tp) +{ + list_del_init(&tp->list); + if (list_empty(trace_probe_probe_list(tp))) + trace_probe_event_free(tp->event); + tp->event = NULL; +} void trace_probe_cleanup(struct trace_probe *tp) { @@ -894,15 +923,8 @@ void trace_probe_cleanup(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); - if (tp->event) { - struct trace_event_call *call = trace_probe_event_call(tp); - - kfree(tp->event->class.system); - kfree(call->name); - kfree(call->print_fmt); - kfree(tp->event); - tp->event = NULL; - } + if (tp->event) + trace_probe_unlink(tp); } int trace_probe_init(struct trace_probe *tp, const char *event, @@ -1006,3 +1028,22 @@ int trace_probe_remove_file(struct trace_probe *tp, return 0; } + +/* + * Return the smallest index of different type argument (start from 1). + * If all argument types and name are same, return 0. 
+ */ +int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) +{ + int i; + + for (i = 0; i < a->nr_args; i++) { + if ((b->nr_args <= i) || + ((a->args[i].type != b->args[i].type) || + (a->args[i].count != b->args[i].count) || + strcmp(a->args[i].name, b->args[i].name))) + return i + 1; + } + + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 0b84abb884c2..39926e8a344b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -302,6 +302,13 @@ static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp) return &tp->event->probes; } +static inline bool trace_probe_has_sibling(struct trace_probe *tp) +{ + struct list_head *list = trace_probe_probe_list(tp); + + return !list_empty(list) && !list_is_singular(list); +} + static inline int trace_probe_unregister_event_call(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ @@ -316,12 +323,15 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp) int trace_probe_init(struct trace_probe *tp, const char *event, const char *group); void trace_probe_cleanup(struct trace_probe *tp); +int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); +void trace_probe_unlink(struct trace_probe *tp); int trace_probe_register_event_call(struct trace_probe *tp); int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file); int trace_probe_remove_file(struct trace_probe *tp, struct trace_event_file *file); struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, struct trace_event_file *file); +int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b); #define trace_probe_for_each_link(pos, tp) \ list_for_each_entry(pos, &(tp)->event->files, list) @@ -419,7 +429,9 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(ARG_TOO_LONG, "Argument expression is too long"), \ C(NO_ARG_BODY, "No argument expression"), \ C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ - C(FAIL_REG_PROBE, "Failed to register probe event"), + C(FAIL_REG_PROBE, "Failed to register probe event"),\ + C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\ + C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"), #undef C #define C(a, b) TP_ERR_##a -- cgit v1.2.3 From 41af3cf587f476f9a879b08219324c8b456e6a4c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:07:58 +0900 Subject: tracing/uprobe: Add multi-probe per uprobe event support Allow user to define several probes on one uprobe event. Note that this only support appending method. So deleting event will delete all probes on the event. 
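For example (binary path and offsets are illustrative), two uprobes can now share a single event as long as their signatures match:

  # echo 'p:myevent /bin/bash:0x4740' > uprobe_events
  # echo 'p:myevent /bin/bash:0x4750' >> uprobe_events
  # cat uprobe_events
  p:uprobes/myevent /bin/bash:0x0000000000004740
  p:uprobes/myevent /bin/bash:0x0000000000004750

(on a 64-bit kernel the offsets are listed zero-padded).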
Link: http://lkml.kernel.org/r/156095687876.28024.13840331032234992863.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_uprobe.c | 60 +++++++++++++++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a8505d84b76e..c7797a81a37e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4823,7 +4823,7 @@ static const char readme_msg[] = "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_UPROBE_EVENTS - " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" + " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2862e6829e48..d84e09abb8de 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -364,15 +364,32 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) { int ret; + if (trace_probe_has_sibling(&tu->tp)) + goto unreg; + ret = unregister_uprobe_event(tu); if (ret) return ret; +unreg: dyn_event_remove(&tu->devent); + trace_probe_unlink(&tu->tp); free_trace_uprobe(tu); return 0; } +static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) +{ + int ret; + + /* Append to existing event */ + ret = trace_probe_append(&tu->tp, &to->tp); + if (!ret) + dyn_event_add(&tu->devent); + + return ret; +} + /* * Uprobe with multiple reference counter is not allowed. i.e. * If inode and offset matches, reference counter offset *must* @@ -382,25 +399,21 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) * as the new one does not conflict with any other existing * ones. */ -static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) +static int validate_ref_ctr_offset(struct trace_uprobe *new) { struct dyn_event *pos; - struct trace_uprobe *tmp, *old = NULL; + struct trace_uprobe *tmp; struct inode *new_inode = d_real_inode(new->path.dentry); - old = find_probe_event(trace_probe_name(&new->tp), - trace_probe_group_name(&new->tp)); - for_each_trace_uprobe(tmp, pos) { - if ((old ? 
old != tmp : true) && - new_inode == d_real_inode(tmp->path.dentry) && + if (new_inode == d_real_inode(tmp->path.dentry) && new->offset == tmp->offset && new->ref_ctr_offset != tmp->ref_ctr_offset) { pr_warn("Reference counter offset mismatch."); - return ERR_PTR(-EINVAL); + return -EINVAL; } } - return old; + return 0; } /* Register a trace_uprobe and probe_event */ @@ -411,18 +424,29 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&event_mutex); - /* register as an event */ - old_tu = find_old_trace_uprobe(tu); - if (IS_ERR(old_tu)) { - ret = PTR_ERR(old_tu); + ret = validate_ref_ctr_offset(tu); + if (ret) goto end; - } + /* register as an event */ + old_tu = find_probe_event(trace_probe_name(&tu->tp), + trace_probe_group_name(&tu->tp)); if (old_tu) { - /* delete old event */ - ret = unregister_trace_uprobe(old_tu); - if (ret) - goto end; + if (is_ret_probe(tu) != is_ret_probe(old_tu)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + ret = -EEXIST; + } else { + ret = trace_probe_compare_arg_type(&tu->tp, &old_tu->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + ret = -EEXIST; + } else + ret = append_trace_uprobe(tu, old_tu); + } + goto end; } ret = register_uprobe_event(tu); -- cgit v1.2.3 From eb5bf81330a722d0079d28eed13d3a9355d938bf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:08:08 +0900 Subject: tracing/kprobe: Add per-probe delete from event Allow user to delete a probe from event. This is done by head match. For example, if we have 2 probes on an event $ cat kprobe_events p:kprobes/testprobe _do_fork r1=%ax r2=%dx p:kprobes/testprobe idle_fork r1=%ax r2=%cx Then you can remove one of them by passing the head of definition which identify the probe. 
$ echo "-:kprobes/testprobe idle_fork" >> kprobe_events Link: http://lkml.kernel.org/r/156095688848.28024.15798690082378432435.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 25 ++++++++++++++++++++++++- kernel/trace/trace_probe.c | 18 ++++++++++++++++++ kernel/trace/trace_probe.h | 2 ++ 3 files changed, 44 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f43098bf62dd..18c4175b6585 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -137,13 +137,36 @@ static bool trace_kprobe_is_busy(struct dyn_event *ev) return trace_probe_is_enabled(&tk->tp); } +static bool trace_kprobe_match_command_head(struct trace_kprobe *tk, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + + if (!argc) + return true; + + if (!tk->symbol) + snprintf(buf, sizeof(buf), "0x%p", tk->rp.kp.addr); + else if (tk->rp.kp.offset) + snprintf(buf, sizeof(buf), "%s+%u", + trace_kprobe_symbol(tk), tk->rp.kp.offset); + else + snprintf(buf, sizeof(buf), "%s", trace_kprobe_symbol(tk)); + if (strcmp(buf, argv[0])) + return false; + argc--; argv++; + + return trace_probe_match_command_args(&tk->tp, argc, argv); +} + static bool trace_kprobe_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev) { struct trace_kprobe *tk = to_trace_kprobe(ev); return strcmp(trace_probe_name(&tk->tp), event) == 0 && - (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0); + (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) && + trace_kprobe_match_command_head(tk, argc, argv); } static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 651a1449acde..f8c3c65c035d 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1047,3 +1047,21 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) return 0; } + +bool trace_probe_match_command_args(struct trace_probe *tp, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + int i; + + if (tp->nr_args < argc) + return false; + + for (i = 0; i < argc; i++) { + snprintf(buf, sizeof(buf), "%s=%s", + tp->args[i].name, tp->args[i].comm); + if (strcmp(buf, argv[i])) + return false; + } + return true; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 39926e8a344b..2dcc4e317787 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -332,6 +332,8 @@ int trace_probe_remove_file(struct trace_probe *tp, struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, struct trace_event_file *file); int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b); +bool trace_probe_match_command_args(struct trace_probe *tp, + int argc, const char **argv); #define trace_probe_for_each_link(pos, tp) \ list_for_each_entry(pos, &(tp)->event->files, list) -- cgit v1.2.3 From ab10d69eb714961d1eca4129e4f8cda5e0618f66 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:08:18 +0900 Subject: tracing/uprobe: Add per-probe delete from event Add per-probe delete method from one event passing the head of definition. In other words, the events which match the head N parameters are deleted. 
Link: http://lkml.kernel.org/r/156095689811.28024.221706761151739433.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d84e09abb8de..84925b5b6db5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -284,13 +284,42 @@ static bool trace_uprobe_is_busy(struct dyn_event *ev) return trace_probe_is_enabled(&tu->tp); } +static bool trace_uprobe_match_command_head(struct trace_uprobe *tu, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + int len; + + if (!argc) + return true; + + len = strlen(tu->filename); + if (strncmp(tu->filename, argv[0], len) || argv[0][len] != ':') + return false; + + if (tu->ref_ctr_offset == 0) + snprintf(buf, sizeof(buf), "0x%0*lx", + (int)(sizeof(void *) * 2), tu->offset); + else + snprintf(buf, sizeof(buf), "0x%0*lx(0x%lx)", + (int)(sizeof(void *) * 2), tu->offset, + tu->ref_ctr_offset); + if (strcmp(buf, &argv[0][len + 1])) + return false; + + argc--; argv++; + + return trace_probe_match_command_args(&tu->tp, argc, argv); +} + static bool trace_uprobe_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev) { struct trace_uprobe *tu = to_trace_uprobe(ev); return strcmp(trace_probe_name(&tu->tp), event) == 0 && - (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0); + (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && + trace_uprobe_match_command_head(tu, argc, argv); } static nokprobe_inline struct trace_uprobe * -- cgit v1.2.3 From 6218bf9f4d2942e88d97b60abc8c2ca0532e41a8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:08:27 +0900 Subject: tracing/probe: Add immediate parameter support Add immediate value parameter (\1234) support to probe events. This allows you to specify an immediate (or dummy) parameter instead of fetching from memory or register. This feature looks odd, but imagine when you put a probe on a code to trace some data. If the code is compiled into 2 instructions and 1 instruction has a value but other has nothing since it is optimized out. In that case, you can not fold those into one event, even if ftrace supported multiple probes on one event. With this feature, you can set a dummy value like foo=\deadbeef instead of something like foo=%di. 
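For example (probe point, argument names and values are illustrative), decimal and hex immediates can be mixed with ordinary fetch args:

  # echo 'p:myprobe _do_fork flags=\1 magic=\0xdeadbeef' >> kprobe_events

Since the immediate is parsed with kstrtoul()/kstrtol(), hex values need the usual 0x prefix.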
Link: http://lkml.kernel.org/r/156095690733.28024.13258186548822649469.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_probe.c | 18 ++++++++++++++++++ kernel/trace/trace_probe.h | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c7797a81a37e..fb4003c10151 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4848,7 +4848,7 @@ static const char readme_msg[] = #else "\t $stack, $stack, $retval, $comm,\n" #endif - "\t +|-[u]()\n" + "\t +|-[u](), \\imm-value\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b@/, ustring,\n" "\t \\[\\]\n" diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index f8c3c65c035d..fb90baec3cd8 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -316,6 +316,17 @@ inval_var: return -EINVAL; } +static int str_to_immediate(char *str, unsigned long *imm) +{ + if (isdigit(str[0])) + return kstrtoul(str, 0, imm); + else if (str[0] == '-') + return kstrtol(str, 0, (long *)imm); + else if (str[0] == '+') + return kstrtol(str + 1, 0, (long *)imm); + return -EINVAL; +} + /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *type, @@ -444,6 +455,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->offset = offset; } break; + case '\\': /* Immediate value */ + ret = str_to_immediate(arg + 1, &code->immediate); + if (ret) + trace_probe_log_err(offs + 1, BAD_IMM); + else + code->op = FETCH_OP_IMM; + break; } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 2dcc4e317787..cc113b82a4ce 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -408,6 +408,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_VAR, "Invalid $-valiable specified"), \ C(BAD_REG_NAME, "Invalid register name"), \ C(BAD_MEM_ADDR, "Invalid memory address"), \ + C(BAD_IMM, "Invalid immediate value"), \ C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ C(BAD_FILE_OFFS, "Invalid file offset value"), \ C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ -- cgit v1.2.3 From a42e3c4de9642d5de524a0a48a7ce96872662dca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:08:37 +0900 Subject: tracing/probe: Add immediate string parameter support Add immediate string parameter (\"string") support to probe events. This allows you to specify an immediate (or dummy) parameter instead of fetching a string from memory. This feature looks odd, but imagine that you put a probe on a code to trace some string data. If the code is compiled into 2 instructions and 1 instruction has a string on memory but other has no string since it is optimized out. In that case, you can not fold those into one event, even if ftrace supported multiple probes on one event. With this feature, you can set a dummy string like foo=\"(optimized)":string instead of something like foo=+0(+0(%bp)):string. 
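For example (probe point and text are illustrative), from a shell the definition is easiest to pass in single quotes so that the backslash and the double quotes reach the parser untouched:

  # echo 'p:myprobe _do_fork msg=\"hello":string' >> kprobe_events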
Link: http://lkml.kernel.org/r/156095691687.28024.13372712423865047991.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_kprobe.c | 3 +++ kernel/trace/trace_probe.c | 56 +++++++++++++++++++++++++++++++++------------ kernel/trace/trace_probe.h | 2 ++ kernel/trace/trace_uprobe.c | 3 +++ 5 files changed, 51 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fb4003c10151..3916b72de715 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4848,7 +4848,7 @@ static const char readme_msg[] = #else "\t $stack, $stack, $retval, $comm,\n" #endif - "\t +|-[u](), \\imm-value\n" + "\t +|-[u](), \\imm-value, \\\"imm-string\"\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b@/, ustring,\n" "\t \\[\\]\n" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 18c4175b6585..7579c53bb053 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1083,6 +1083,9 @@ retry: case FETCH_OP_COMM: val = (unsigned long)current->comm; break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API case FETCH_OP_ARG: val = regs_get_kernel_argument(regs, code->param); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index fb90baec3cd8..1e67fef06e53 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -327,6 +327,18 @@ static int str_to_immediate(char *str, unsigned long *imm) return -EINVAL; } +static int __parse_imm_string(char *str, char **pbuf, int offs) +{ + size_t len = strlen(str); + + if (str[len - 1] != '"') { + trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE); + return -EINVAL; + } + *pbuf = kstrndup(str, len - 1, GFP_KERNEL); + return 0; +} + /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *type, @@ -441,7 +453,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, ret = parse_probe_arg(arg, t2, &code, end, flags, offs); if (ret) break; - if (code->op == FETCH_OP_COMM) { + if (code->op == FETCH_OP_COMM || + code->op == FETCH_OP_DATA) { trace_probe_log_err(offs, COMM_CANT_DEREF); return -EINVAL; } @@ -456,11 +469,19 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } break; case '\\': /* Immediate value */ - ret = str_to_immediate(arg + 1, &code->immediate); - if (ret) - trace_probe_log_err(offs + 1, BAD_IMM); - else - code->op = FETCH_OP_IMM; + if (arg[1] == '"') { /* Immediate string */ + ret = __parse_imm_string(arg + 2, &tmp, offs + 2); + if (ret) + break; + code->op = FETCH_OP_DATA; + code->data = tmp; + } else { + ret = str_to_immediate(arg + 1, &code->immediate); + if (ret) + trace_probe_log_err(offs + 1, BAD_IMM); + else + code->op = FETCH_OP_IMM; + } break; } if (!ret && code->op == FETCH_OP_NOP) { @@ -560,8 +581,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } } - /* Since $comm can not be dereferred, we can find $comm by strcmp */ - if (strcmp(arg, "$comm") == 0) { + /* + * Since $comm and immediate string can not be dereferred, + * we can find those by strcmp. + */ + if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { /* The type of $comm must be "string", and not an array. 
*/ if (parg->count || (t && strcmp(t, "string"))) return -EINVAL; @@ -598,7 +622,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (!strcmp(parg->type->name, "string") || !strcmp(parg->type->name, "ustring")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && - code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { + code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && + code->op != FETCH_OP_DATA) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); ret = -EINVAL; @@ -607,9 +632,10 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) || parg->count) { /* - * IMM and COMM is pointing actual address, those must - * be kept, and if parg->count != 0, this is an array - * of string pointers instead of string address itself. + * IMM, DATA and COMM is pointing actual address, those + * must be kept, and if parg->count != 0, this is an + * array of string pointers instead of string address + * itself. */ code++; if (code->op != FETCH_OP_NOP) { @@ -683,7 +709,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, fail: if (ret) { for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) - if (code->op == FETCH_NOP_SYMBOL) + if (code->op == FETCH_NOP_SYMBOL || + code->op == FETCH_OP_DATA) kfree(code->data); } kfree(tmp); @@ -754,7 +781,8 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) struct fetch_insn *code = arg->code; while (code && code->op != FETCH_OP_END) { - if (code->op == FETCH_NOP_SYMBOL) + if (code->op == FETCH_NOP_SYMBOL || + code->op == FETCH_OP_DATA) kfree(code->data); code++; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index cc113b82a4ce..f805cc4cbe7c 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -89,6 +89,7 @@ enum fetch_op { FETCH_OP_COMM, /* Current comm */ FETCH_OP_ARG, /* Function argument : .param */ FETCH_OP_FOFFS, /* File offset: .immediate */ + FETCH_OP_DATA, /* Allocated data: .data */ // Stage 2 (dereference) op FETCH_OP_DEREF, /* Dereference: .offset */ FETCH_OP_UDEREF, /* User-space Dereference: .offset */ @@ -409,6 +410,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_REG_NAME, "Invalid register name"), \ C(BAD_MEM_ADDR, "Invalid memory address"), \ C(BAD_IMM, "Invalid immediate value"), \ + C(IMMSTR_NO_CLOSE, "String is not closed with '\"'"), \ C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ C(BAD_FILE_OFFS, "Invalid file offset value"), \ C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 84925b5b6db5..cbf4da4bf367 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -248,6 +248,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, case FETCH_OP_COMM: val = FETCH_TOKEN_COMM; break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; case FETCH_OP_FOFFS: val = translate_user_vaddr(code->immediate); break; -- cgit v1.2.3 From f7edb451fa51e44e62177347ea7850aa0e901ea5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Aug 2019 11:28:59 -0400 Subject: tracing/arm64: Have max stack tracer handle the case of return address after data Most archs (well at least x86) store the function call return address on the stack before storing the local variables for the function. 
The max stack tracer depends on this in its algorithm to display the stack size of each function it finds in the back trace. Some archs (arm64), may store the return address (from its link register) just before calling a nested function. There's no reason to save the link register on leaf functions, as it wont be updated. This breaks the algorithm of the max stack tracer. Add a new define ARCH_FTRACE_SHIFT_STACK_TRACER that an architecture may set if it stores the return address (link register) after it stores the function's local variables, and have the stack trace shift the values of the mapped stack size to the appropriate functions. Link: 20190802094103.163576-1-jiping.ma2@windriver.com Reported-by: Jiping Ma Acked-by: Will Deacon Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5d16f73898db..642a850af81a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -158,6 +158,20 @@ static void check_stack(unsigned long ip, unsigned long *stack) i++; } +#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER + /* + * Some archs will store the link register before calling + * nested functions. This means the saved return address + * comes after the local storage, and we need to shift + * for that. + */ + if (x > 1) { + memmove(&stack_trace_index[0], &stack_trace_index[1], + sizeof(stack_trace_index[0]) * (x - 1)); + x--; + } +#endif + stack_trace_nr_entries = x; if (task_stack_end_corrupted(current)) { -- cgit v1.2.3 From 58fe7a87db51ea00596187765dabfc2c4ea2b436 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Aug 2019 12:27:30 -0400 Subject: tracing: Document the stack trace algorithm in the comments As the max stack tracer algorithm is not that easy to understand from the code, add comments that explain the algorithm and mentions how ARCH_FTRACE_SHIFT_STACK_TRACER affects it. Link: http://lkml.kernel.org/r/20190806123455.487ac02b@gandalf.local.home Suggested-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 642a850af81a..ec9a34a97129 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -53,6 +53,104 @@ static void print_max_stack(void) } } +/* + * The stack tracer looks for a maximum stack at each call from a function. It + * registers a callback from ftrace, and in that callback it examines the stack + * size. It determines the stack size from the variable passed in, which is the + * address of a local variable in the stack_trace_call() callback function. + * The stack size is calculated by the address of the local variable to the top + * of the current stack. If that size is smaller than the currently saved max + * stack size, nothing more is done. + * + * If the size of the stack is greater than the maximum recorded size, then the + * following algorithm takes place. 
+ * + * For architectures (like x86) that store the function's return address before + * saving the function's local variables, the stack will look something like + * this: + * + * [ top of stack ] + * 0: sys call entry frame + * 10: return addr to entry code + * 11: start of sys_foo frame + * 20: return addr to sys_foo + * 21: start of kernel_func_bar frame + * 30: return addr to kernel_func_bar + * 31: [ do trace stack here ] + * + * The save_stack_trace() is called returning all the functions it finds in the + * current stack. Which would be (from the bottom of the stack to the top): + * + * return addr to kernel_func_bar + * return addr to sys_foo + * return addr to entry code + * + * Now to figure out how much each of these functions' local variable size is, + * a search of the stack is made to find these values. When a match is made, it + * is added to the stack_dump_trace[] array. The offset into the stack is saved + * in the stack_trace_index[] array. The above example would show: + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 30 + * return addr to sys_foo | 20 + * return addr to entry | 10 + * + * The print_max_stack() function above, uses these values to print the size of + * each function's portion of the stack. + * + * for (i = 0; i < nr_entries; i++) { + * size = i == nr_entries - 1 ? stack_trace_index[i] : + * stack_trace_index[i] - stack_trace_index[i+1] + * print "%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i]); + * } + * + * The above shows + * + * depth size location + * ----- ---- -------- + * 0 30 10 kernel_func_bar + * 1 20 10 sys_foo + * 2 10 10 entry code + * + * Now for architectures that might save the return address after the functions + * local variables (saving the link register before calling nested functions), + * this will cause the stack to look a little different: + * + * [ top of stack ] + * 0: sys call entry frame + * 10: start of sys_foo_frame + * 19: return addr to entry code << lr saved before calling kernel_func_bar + * 20: start of kernel_func_bar frame + * 29: return addr to sys_foo_frame << lr saved before calling next function + * 30: [ do trace stack here ] + * + * Although the functions returned by save_stack_trace() may be the same, the + * placement in the stack will be different. Using the same algorithm as above + * would yield: + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 30 + * return addr to sys_foo | 29 + * return addr to entry | 19 + * + * Where the mapping is off by one: + * + * kernel_func_bar stack frame size is 29 - 19 not 30 - 29! + * + * To fix this, if the architecture sets ARCH_RET_ADDR_AFTER_LOCAL_VARS the + * values in stack_trace_index[] are shifted by one to and the number of + * stack trace entries is decremented by one. + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 29 + * return addr to sys_foo | 19 + * + * Although the entry function is not displayed, the first function (sys_foo) + * will still include the stack size of it. 
+ */ static void check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; -- cgit v1.2.3 From a47b53e95accfd2814efe39dfca06dbd45cd857a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 13 Aug 2019 12:14:35 -0400 Subject: tracing: Rename tracing_reset() to tracing_reset_cpu() The name tracing_reset() was a misnomer, as it really only reset a single CPU buffer. Rename it to tracing_reset_cpu() and also make it static and remove the prototype from trace.h, as it is only used in a single function. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++--- kernel/trace/trace.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3916b72de715..e917aa783675 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1854,7 +1854,7 @@ int __init register_tracer(struct tracer *type) return ret; } -void tracing_reset(struct trace_buffer *buf, int cpu) +static void tracing_reset_cpu(struct trace_buffer *buf, int cpu) { struct ring_buffer *buffer = buf->buffer; @@ -4251,7 +4251,7 @@ static int tracing_open(struct inode *inode, struct file *file) if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(trace_buf); else - tracing_reset(trace_buf, cpu); + tracing_reset_cpu(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { @@ -6742,7 +6742,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, if (iter->cpu_file == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->max_buffer); else - tracing_reset(&tr->max_buffer, iter->cpu_file); + tracing_reset_cpu(&tr->max_buffer, iter->cpu_file); } break; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 005f08629b8b..26b0a08f3c7d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,7 +677,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void tracing_reset(struct trace_buffer *buf, int cpu); void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); -- cgit v1.2.3 From 6ba99411b858bd70bae966633561e698cd6de38c Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 28 Aug 2019 21:35:40 +0900 Subject: dma-mapping: introduce dma_get_merge_boundary() This patch adds a new DMA API "dma_get_merge_boundary". This function returns the DMA merge boundary if the DMA layer can merge the segments. This patch also adds the implementation for a new dma_map_ops pointer. 
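A minimal sketch of a caller of the new helper (the helper name below and the decision made on the result are illustrative, not part of this patch):

	#include <linux/device.h>
	#include <linux/dma-mapping.h>

	/*
	 * Illustrative only: returns true when the DMA layer backing @dev
	 * can merge scatterlist segments. dma_get_merge_boundary() returns
	 * 0 when no merging is possible.
	 */
	static bool example_dma_can_merge(struct device *dev)
	{
		return dma_get_merge_boundary(dev) != 0;
	}

A block or storage driver could use such a check to decide whether to advertise one large segment (merged, e.g., by an IOMMU) instead of many hardware-sized segments.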
Signed-off-by: Yoshihiro Shimoda Reviewed-by: Simon Horman Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 1b96616c9f20..72c825c1788e 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -407,3 +407,14 @@ size_t dma_max_mapping_size(struct device *dev) return size; } EXPORT_SYMBOL_GPL(dma_max_mapping_size); + +unsigned long dma_get_merge_boundary(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (!ops || !ops->get_merge_boundary) + return 0; /* can't merge */ + + return ops->get_merge_boundary(dev); +} +EXPORT_SYMBOL_GPL(dma_get_merge_boundary); -- cgit v1.2.3 From 5e2d2cc2588bd3307ce3937acbc2ed03c830a861 Mon Sep 17 00:00:00 2001 From: Liangyan Date: Mon, 26 Aug 2019 20:16:33 +0800 Subject: sched/fair: Don't assign runtime for throttled cfs_rq do_sched_cfs_period_timer() will refill cfs_b runtime and call distribute_cfs_runtime to unthrottle cfs_rq, sometimes cfs_b->runtime will allocate all quota to one cfs_rq incorrectly, then other cfs_rqs attached to this cfs_b can't get runtime and will be throttled. We find that one throttled cfs_rq has non-negative cfs_rq->runtime_remaining and cause an unexpetced cast from s64 to u64 in snippet: distribute_cfs_runtime() { runtime = -cfs_rq->runtime_remaining + 1; } The runtime here will change to a large number and consume all cfs_b->runtime in this cfs_b period. According to Ben Segall, the throttled cfs_rq can have account_cfs_rq_runtime called on it because it is throttled before idle_balance, and the idle_balance calls update_rq_clock to add time that is accounted to the task. This commit prevents cfs_rq to be assgined new runtime if it has been throttled until that distribute_cfs_runtime is called. Signed-off-by: Liangyan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: shanpeic@linux.alibaba.com Cc: stable@vger.kernel.org Cc: xlpang@linux.alibaba.com Fixes: d3d9dc330236 ("sched: Throttle entities exceeding their allowed bandwidth") Link: https://lkml.kernel.org/r/20190826121633.6538-1-liangyan.peng@linux.alibaba.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc9cfeaac8bd..500f5db0de0b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4470,6 +4470,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) if (likely(cfs_rq->runtime_remaining > 0)) return; + if (cfs_rq->throttled) + return; /* * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled @@ -4673,6 +4675,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, if (!cfs_rq_throttled(cfs_rq)) goto next; + /* By the above check, this should never be true */ + SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + runtime = -cfs_rq->runtime_remaining + 1; if (runtime > remaining) runtime = remaining; -- cgit v1.2.3 From a55c7454a8c887b226a01d7eed088ccb5374d81e Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 8 Aug 2019 20:53:01 +0100 Subject: sched/topology: Improve load balancing on AMD EPYC systems SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init() for any sched domains with a NUMA distance greater than 2 hops (RECLAIM_DISTANCE). 
The idea being that it's expensive to balance across domains that far apart. However, as is rather unfortunately explained in: commit 32e45ff43eaf ("mm: increase RECLAIM_DISTANCE to 30") the value for RECLAIM_DISTANCE is based on node distance tables from 2011-era hardware. Current AMD EPYC machines have the following NUMA node distances: node distances: node 0 1 2 3 4 5 6 7 0: 10 16 16 16 32 32 32 32 1: 16 10 16 16 32 32 32 32 2: 16 16 10 16 32 32 32 32 3: 16 16 16 10 32 32 32 32 4: 32 32 32 32 10 16 16 16 5: 32 32 32 32 16 10 16 16 6: 32 32 32 32 16 16 10 16 7: 32 32 32 32 16 16 16 10 where 2 hops is 32. The result is that the scheduler fails to load balance properly across NUMA nodes on different sockets -- 2 hops apart. For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4 (CPUs 32-39) like so, $ numactl -C 0-7,32-39 ./spinner 16 causes all threads to fork and remain on node 0 until the active balancer kicks in after a few seconds and forcibly moves some threads to node 4. Override node_reclaim_distance for AMD Zen. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Suravee.Suthikulpanit@amd.com Cc: Thomas Gleixner Cc: Thomas.Lendacky@amd.com Cc: Tony Luck Link: https://lkml.kernel.org/r/20190808195301.13222-3-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8f83e8e3ea9a..b5667a273bf6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1284,6 +1284,7 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; +int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; #endif /* @@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) { sd->flags &= ~(SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE); -- cgit v1.2.3 From 2480c093130f64ac3a410504fa8b3db1fc4b87ce Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:06 +0100 Subject: sched/uclamp: Extend CPU's cgroup controller The cgroup CPU bandwidth controller allows to assign a specified (maximum) bandwidth to the tasks of a group. However this bandwidth is defined and enforced only on a temporal base, without considering the actual frequency a CPU is running on. Thus, the amount of computation completed by a task within an allocated bandwidth can be very different depending on the actual frequency the CPU is running that task. The amount of computation can be affected also by the specific CPU a task is running on, especially when running on asymmetric capacity systems like Arm's big.LITTLE. With the availability of schedutil, the scheduler is now able to drive frequency selections based on actual task utilization. Moreover, the utilization clamping support provides a mechanism to bias the frequency selection operated by schedutil depending on constraints assigned to the tasks currently RUNNABLE on a CPU. 
Giving the mechanisms described above, it is now possible to extend the cpu controller to specify the minimum (or maximum) utilization which should be considered for tasks RUNNABLE on a cpu. This makes it possible to better defined the actual computational power assigned to task groups, thus improving the cgroup CPU bandwidth controller which is currently based just on time constraints. Extend the CPU controller with a couple of new attributes uclamp.{min,max} which allow to enforce utilization boosting and capping for all the tasks in a group. Specifically: - uclamp.min: defines the minimum utilization which should be considered i.e. the RUNNABLE tasks of this group will run at least at a minimum frequency which corresponds to the uclamp.min utilization - uclamp.max: defines the maximum utilization which should be considered i.e. the RUNNABLE tasks of this group will run up to a maximum frequency which corresponds to the uclamp.max utilization These attributes: a) are available only for non-root nodes, both on default and legacy hierarchies, while system wide clamps are defined by a generic interface which does not depends on cgroups. This system wide interface enforces constraints on tasks in the root node. b) enforce effective constraints at each level of the hierarchy which are a restriction of the group requests considering its parent's effective constraints. Root group effective constraints are defined by the system wide interface. This mechanism allows each (non-root) level of the hierarchy to: - request whatever clamp values it would like to get - effectively get only up to the maximum amount allowed by its parent c) have higher priority than task-specific clamps, defined via sched_setattr(), thus allowing to control and restrict task requests. Add two new attributes to the cpu controller to collect "requested" clamp values. Allow that at each non-root level of the hierarchy. Keep it simple by not caring now about "effective" values computation and propagation along the hierarchy. Update sysctl_sched_uclamp_handler() to use the newly introduced uclamp_mutex so that we serialize system default updates with cgroup relate updates. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 193 +++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 8 +++ 2 files changed, 197 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a6661852907b..c186abed5c6d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load) } #ifdef CONFIG_UCLAMP_TASK +/* + * Serializes updates of utilization clamp values + * + * The (slow-path) user-space triggers utilization clamp value updates which + * can require updates on (fast-path) scheduler's data structures used to + * support enqueue/dequeue operations. 
+ * While the per-CPU rq lock protects fast-path update operations, user-space + * requests are serialized using a mutex to reduce the risk of conflicting + * updates or API abuses. + */ +static DEFINE_MUTEX(uclamp_mutex); + /* Max allowed minimum utilization */ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -1010,10 +1022,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, loff_t *ppos) { int old_min, old_max; - static DEFINE_MUTEX(mutex); int result; - mutex_lock(&mutex); + mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; @@ -1048,7 +1059,7 @@ undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; done: - mutex_unlock(&mutex); + mutex_unlock(&uclamp_mutex); return result; } @@ -1137,6 +1148,8 @@ static void __init init_uclamp(void) unsigned int clamp_id; int cpu; + mutex_init(&uclamp_mutex); + for_each_possible_cpu(cpu) { memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); cpu_rq(cpu)->uclamp_flags = 0; @@ -1149,8 +1162,12 @@ static void __init init_uclamp(void) /* System defaults allow max clamp values for both indexes */ uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); - for_each_clamp_id(clamp_id) + for_each_clamp_id(clamp_id) { uclamp_default[clamp_id] = uc_max; +#ifdef CONFIG_UCLAMP_TASK_GROUP + root_task_group.uclamp_req[clamp_id] = uc_max; +#endif + } } #else /* CONFIG_UCLAMP_TASK */ @@ -6798,6 +6815,19 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); +static inline void alloc_uclamp_sched_group(struct task_group *tg, + struct task_group *parent) +{ +#ifdef CONFIG_UCLAMP_TASK_GROUP + int clamp_id; + + for_each_clamp_id(clamp_id) { + uclamp_se_set(&tg->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + } +#endif +} + static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -6821,6 +6851,8 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + alloc_uclamp_sched_group(tg, parent); + return tg; err: @@ -7037,6 +7069,131 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) sched_move_task(task); } +#ifdef CONFIG_UCLAMP_TASK_GROUP + +/* + * Integer 10^N with a given N exponent by casting to integer the literal "1eN" + * C expression. Since there is no way to convert a macro argument (N) into a + * character constant, use two levels of macros. 
+ */ +#define _POW10(exp) ((unsigned int)1e##exp) +#define POW10(exp) _POW10(exp) + +struct uclamp_request { +#define UCLAMP_PERCENT_SHIFT 2 +#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT)) + s64 percent; + u64 util; + int ret; +}; + +static inline struct uclamp_request +capacity_from_percent(char *buf) +{ + struct uclamp_request req = { + .percent = UCLAMP_PERCENT_SCALE, + .util = SCHED_CAPACITY_SCALE, + .ret = 0, + }; + + buf = strim(buf); + if (strcmp(buf, "max")) { + req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT, + &req.percent); + if (req.ret) + return req; + if (req.percent > UCLAMP_PERCENT_SCALE) { + req.ret = -ERANGE; + return req; + } + + req.util = req.percent << SCHED_CAPACITY_SHIFT; + req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE); + } + + return req; +} + +static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, + enum uclamp_id clamp_id) +{ + struct uclamp_request req; + struct task_group *tg; + + req = capacity_from_percent(buf); + if (req.ret) + return req.ret; + + mutex_lock(&uclamp_mutex); + rcu_read_lock(); + + tg = css_tg(of_css(of)); + if (tg->uclamp_req[clamp_id].value != req.util) + uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false); + + /* + * Because of not recoverable conversion rounding we keep track of the + * exact requested value + */ + tg->uclamp_pct[clamp_id] = req.percent; + + rcu_read_unlock(); + mutex_unlock(&uclamp_mutex); + + return nbytes; +} + +static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN); +} + +static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX); +} + +static inline void cpu_uclamp_print(struct seq_file *sf, + enum uclamp_id clamp_id) +{ + struct task_group *tg; + u64 util_clamp; + u64 percent; + u32 rem; + + rcu_read_lock(); + tg = css_tg(seq_css(sf)); + util_clamp = tg->uclamp_req[clamp_id].value; + rcu_read_unlock(); + + if (util_clamp == SCHED_CAPACITY_SCALE) { + seq_puts(sf, "max\n"); + return; + } + + percent = tg->uclamp_pct[clamp_id]; + percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem); + seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem); +} + +static int cpu_uclamp_min_show(struct seq_file *sf, void *v) +{ + cpu_uclamp_print(sf, UCLAMP_MIN); + return 0; +} + +static int cpu_uclamp_max_show(struct seq_file *sf, void *v) +{ + cpu_uclamp_print(sf, UCLAMP_MAX); + return 0; +} +#endif /* CONFIG_UCLAMP_TASK_GROUP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -7381,6 +7538,20 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_rt_period_read_uint, .write_u64 = cpu_rt_period_write_uint, }, +#endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + { + .name = "uclamp.min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_min_show, + .write = cpu_uclamp_min_write, + }, + { + .name = "uclamp.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_max_show, + .write = cpu_uclamp_max_write, + }, #endif { } /* Terminate */ }; @@ -7548,6 +7719,20 @@ static struct cftype cpu_files[] = { .seq_show = cpu_max_show, .write = cpu_max_write, }, +#endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + { + .name = "uclamp.min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_min_show, + .write = 
cpu_uclamp_min_write, + }, + { + .name = "uclamp.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_max_show, + .write = cpu_uclamp_max_write, + }, #endif { } /* terminate */ }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7111e3a1eeb4..ae1be61fb279 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -391,6 +391,14 @@ struct task_group { #endif struct cfs_bandwidth cfs_bandwidth; + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* The two decimal precision [%] value requested from user-space */ + unsigned int uclamp_pct[UCLAMP_CNT]; + /* Clamp values requested for a task group */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; +#endif + }; #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.2.3 From 0b60ba2dd342016e4e717dbaa4ca9af3a43f4434 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:07 +0100 Subject: sched/uclamp: Propagate parent clamps In order to properly support hierarchical resources control, the cgroup delegation model requires that attribute writes from a child group never fail but still are locally consistent and constrained based on parent's assigned resources. This requires to properly propagate and aggregate parent attributes down to its descendants. Implement this mechanism by adding a new "effective" clamp value for each task group. The effective clamp value is defined as the smaller value between the clamp value of a group and the effective clamp value of its parent. This is the actual clamp value enforced on tasks in a task group. Since it's possible for a cpu.uclamp.min value to be bigger than the cpu.uclamp.max value, ensure local consistency by restricting each "protection" (i.e. min utilization) with the corresponding "limit" (i.e. max utilization). Do that at effective clamps propagation to ensure all user-space write never fails while still always tracking the most restrictive values. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . 
Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-3-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 2 ++ 2 files changed, 46 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c186abed5c6d..8855481857b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1166,6 +1166,7 @@ static void __init init_uclamp(void) uclamp_default[clamp_id] = uc_max; #ifdef CONFIG_UCLAMP_TASK_GROUP root_task_group.uclamp_req[clamp_id] = uc_max; + root_task_group.uclamp[clamp_id] = uc_max; #endif } } @@ -6824,6 +6825,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg, for_each_clamp_id(clamp_id) { uclamp_se_set(&tg->uclamp_req[clamp_id], uclamp_none(clamp_id), false); + tg->uclamp[clamp_id] = parent->uclamp[clamp_id]; } #endif } @@ -7070,6 +7072,45 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) } #ifdef CONFIG_UCLAMP_TASK_GROUP +static void cpu_util_update_eff(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys_state *top_css = css; + struct uclamp_se *uc_parent = NULL; + struct uclamp_se *uc_se = NULL; + unsigned int eff[UCLAMP_CNT]; + unsigned int clamp_id; + unsigned int clamps; + + css_for_each_descendant_pre(css, top_css) { + uc_parent = css_tg(css)->parent + ? css_tg(css)->parent->uclamp : NULL; + + for_each_clamp_id(clamp_id) { + /* Assume effective clamps matches requested clamps */ + eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value; + /* Cap effective clamps with parent's effective clamps */ + if (uc_parent && + eff[clamp_id] > uc_parent[clamp_id].value) { + eff[clamp_id] = uc_parent[clamp_id].value; + } + } + /* Ensure protection is always capped by limit */ + eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]); + + /* Propagate most restrictive effective clamps */ + clamps = 0x0; + uc_se = css_tg(css)->uclamp; + for_each_clamp_id(clamp_id) { + if (eff[clamp_id] == uc_se[clamp_id].value) + continue; + uc_se[clamp_id].value = eff[clamp_id]; + uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]); + clamps |= (0x1 << clamp_id); + } + if (!clamps) + css = css_rightmost_descendant(css); + } +} /* * Integer 10^N with a given N exponent by casting to integer the literal "1eN" @@ -7138,6 +7179,9 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, */ tg->uclamp_pct[clamp_id] = req.percent; + /* Update effective clamps to track the most restrictive value */ + cpu_util_update_eff(of_css(of)); + rcu_read_unlock(); mutex_unlock(&uclamp_mutex); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ae1be61fb279..5b343112a47b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -397,6 +397,8 @@ struct task_group { unsigned int uclamp_pct[UCLAMP_CNT]; /* Clamp values requested for a task group */ struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* Effective clamp values used for a task group */ + struct uclamp_se uclamp[UCLAMP_CNT]; #endif }; -- cgit v1.2.3 From 7274a5c1bbec45f06f1fff4b8c8b5855b6cc189d Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:08 +0100 Subject: sched/uclamp: Propagate system defaults to the root group The clamp values are not tunable at the level of the root task group. 
That's for two main reasons: - the root group represents "system resources" which are always entirely available from the cgroup standpoint. - when tuning/restricting "system resources" makes sense, tuning must be done using a system wide API which should also be available when control groups are not. When a system wide restriction is available, cgroups should be aware of its value in order to know exactly how much "system resources" are available for the subgroups. Utilization clamping supports already the concepts of: - system defaults: which define the maximum possible clamp values usable by tasks. - effective clamps: which allows a parent cgroup to constraint (maybe temporarily) its descendants without losing the information related to the values "requested" from them. Exploit these two concepts and bind them together in such a way that, whenever system default are tuned, the new values are propagated to (possibly) restrict or relax the "effective" value of nested cgroups. When cgroups are in use, force an update of all the RUNNABLE tasks. Otherwise, keep things simple and do just a lazy update next time each task will be enqueued. Do that since we assume a more strict resource control is required when cgroups are in use. This allows also to keep "effective" clamp values updated in case we need to expose them to user-space. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-4-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8855481857b5..e6800fe040ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1017,10 +1017,30 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +#ifdef CONFIG_UCLAMP_TASK_GROUP +static void cpu_util_update_eff(struct cgroup_subsys_state *css); +static void uclamp_update_root_tg(void) +{ + struct task_group *tg = &root_task_group; + + uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], + sysctl_sched_uclamp_util_min, false); + uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], + sysctl_sched_uclamp_util_max, false); + + rcu_read_lock(); + cpu_util_update_eff(&root_task_group.css); + rcu_read_unlock(); +} +#else +static void uclamp_update_root_tg(void) { } +#endif + int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + bool update_root_tg = false; int old_min, old_max; int result; @@ -1043,16 +1063,23 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, if (old_min != sysctl_sched_uclamp_util_min) { uclamp_se_set(&uclamp_default[UCLAMP_MIN], sysctl_sched_uclamp_util_min, false); + update_root_tg = true; } if (old_max != sysctl_sched_uclamp_util_max) { uclamp_se_set(&uclamp_default[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); + update_root_tg = true; } + if (update_root_tg) + uclamp_update_root_tg(); + /* - * Updating all the RUNNABLE task is expensive, keep it simple and do - * just a lazy update at each next enqueue 
time. + * We update all RUNNABLE tasks only when task groups are in use. + * Otherwise, keep it simple and do just a lazy update at each next + * task enqueue time. */ + goto done; undo: -- cgit v1.2.3 From 3eac870a324728e5d17118888840dad70bcd37f3 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:09 +0100 Subject: sched/uclamp: Use TG's clamps to restrict TASK's clamps When a task specific clamp value is configured via sched_setattr(2), this value is accounted in the corresponding clamp bucket every time the task is {en,de}qeued. However, when cgroups are also in use, the task specific clamp values could be restricted by the task_group (TG) clamp values. Update uclamp_cpu_inc() to aggregate task and TG clamp values. Every time a task is enqueued, it's accounted in the clamp bucket tracking the smaller clamp between the task specific value and its TG effective value. This allows to: 1. ensure cgroup clamps are always used to restrict task specific requests, i.e. boosted not more than its TG effective protection and capped at least as its TG effective limit. 2. implement a "nice-like" policy, where tasks are still allowed to request less than what enforced by their TG effective limits and protections Do this by exploiting the concept of "effective" clamp, which is already used by a TG to track parent enforced restrictions. Apply task group clamp restrictions only to tasks belonging to a child group. While, for tasks in the root group or in an autogroup, system defaults are still enforced. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-5-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e6800fe040ea..c32ac071c203 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -873,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static inline struct uclamp_se +uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id) +{ + struct uclamp_se uc_req = p->uclamp_req[clamp_id]; +#ifdef CONFIG_UCLAMP_TASK_GROUP + struct uclamp_se uc_max; + + /* + * Tasks in autogroups or root task group will be + * restricted by system defaults. 
+ */ + if (task_group_is_autogroup(task_group(p))) + return uc_req; + if (task_group(p) == &root_task_group) + return uc_req; + + uc_max = task_group(p)->uclamp[clamp_id]; + if (uc_req.value > uc_max.value || !uc_req.user_defined) + return uc_max; +#endif + + return uc_req; +} + /* * The effective clamp bucket index of a task depends on, by increasing * priority: * - the task specific clamp value, when explicitly requested from userspace + * - the task group effective clamp value, for tasks not either in the root + * group or in an autogroup * - the system default clamp value, defined by the sysadmin */ static inline struct uclamp_se uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) { - struct uclamp_se uc_req = p->uclamp_req[clamp_id]; + struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); struct uclamp_se uc_max = uclamp_default[clamp_id]; /* System default restrictions always apply */ -- cgit v1.2.3 From babbe170e053c6ec2343751749995b7b9fd5fd2c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:10 +0100 Subject: sched/uclamp: Update CPU's refcount on TG's clamp changes On updates of task group (TG) clamp values, ensure that these new values are enforced on all RUNNABLE tasks of the task group, i.e. all RUNNABLE tasks are immediately boosted and/or capped as requested. Do that each time we update effective clamps from cpu_util_update_eff(). Use the *cgroup_subsys_state (css) to walk the list of tasks in each affected TG and update their RUNNABLE tasks. Update each task by using the same mechanism used for cpu affinity masks updates, i.e. by taking the rq lock. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-6-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c32ac071c203..55a1c07045ff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1043,6 +1043,54 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void +uclamp_update_active(struct task_struct *p, unsigned int clamp_id) +{ + struct rq_flags rf; + struct rq *rq; + + /* + * Lock the task and the rq where the task is (or was) queued. + * + * We might lock the (previous) rq of a !RUNNABLE task, but that's the + * price to pay to safely serialize util_{min,max} updates with + * enqueues, dequeues and migration operations. + * This is the same locking schema used by __set_cpus_allowed_ptr(). + */ + rq = task_rq_lock(p, &rf); + + /* + * Setting the clamp bucket is serialized by task_rq_lock(). + * If the task is not yet RUNNABLE and its task_struct is not + * affecting a valid clamp bucket, the next time it's enqueued, + * it will already see the updated clamp bucket value. 
+ */ + if (!p->uclamp[clamp_id].active) { + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + } + + task_rq_unlock(rq, p, &rf); +} + +static inline void +uclamp_update_active_tasks(struct cgroup_subsys_state *css, + unsigned int clamps) +{ + struct css_task_iter it; + struct task_struct *p; + unsigned int clamp_id; + + css_task_iter_start(css, 0, &it); + while ((p = css_task_iter_next(&it))) { + for_each_clamp_id(clamp_id) { + if ((0x1 << clamp_id) & clamps) + uclamp_update_active(p, clamp_id); + } + } + css_task_iter_end(&it); +} + #ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *css); static void uclamp_update_root_tg(void) @@ -7160,8 +7208,13 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]); clamps |= (0x1 << clamp_id); } - if (!clamps) + if (!clamps) { css = css_rightmost_descendant(css); + continue; + } + + /* Immediately update descendants RUNNABLE tasks */ + uclamp_update_active_tasks(css, clamps); } } -- cgit v1.2.3 From 0413d7f33e60751570fd6c179546bde2f7d82dcb Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:11 +0100 Subject: sched/uclamp: Always use 'enum uclamp_id' for clamp_id values The supported clamp indexes are defined in 'enum clamp_id', however, because of the code logic in some of the first utilization clamping series version, sometimes we needed to use 'unsigned int' to represent indices. This is not more required since the final version of the uclamp_* APIs can always use the proper enum uclamp_id type. Fix it with a bulk rename now that we have all the bits merged. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . 
Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-7-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 38 +++++++++++++++++++------------------- kernel/sched/sched.h | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55a1c07045ff..3c7b90bcbe4e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -810,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); } -static inline unsigned int uclamp_none(int clamp_id) +static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -826,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se, } static inline unsigned int -uclamp_idle_value(struct rq *rq, unsigned int clamp_id, +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { /* @@ -842,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id, return uclamp_none(UCLAMP_MIN); } -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { /* Reset max-clamp retention only on idle exit */ @@ -853,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, } static inline -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, - unsigned int clamp_value) +enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, + unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; int bucket_id = UCLAMP_BUCKETS - 1; @@ -874,7 +874,7 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, } static inline struct uclamp_se -uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id) +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_req = p->uclamp_req[clamp_id]; #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -906,7 +906,7 @@ uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id) * - the system default clamp value, defined by the sysadmin */ static inline struct uclamp_se -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); struct uclamp_se uc_max = uclamp_default[clamp_id]; @@ -918,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) return uc_req; } -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; @@ -942,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) * for each bucket when all its RUNNABLE tasks require the same clamp. */ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, - unsigned int clamp_id) + enum uclamp_id clamp_id) { struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -980,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, * enforce the expected state and warn. 
*/ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, - unsigned int clamp_id) + enum uclamp_id clamp_id) { struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -1019,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1034,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1044,7 +1044,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) } static inline void -uclamp_update_active(struct task_struct *p, unsigned int clamp_id) +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) { struct rq_flags rf; struct rq *rq; @@ -1077,9 +1077,9 @@ static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css, unsigned int clamps) { + enum uclamp_id clamp_id; struct css_task_iter it; struct task_struct *p; - unsigned int clamp_id; css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) { @@ -1187,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { - unsigned int clamp_id; + enum uclamp_id clamp_id; /* * On scheduling class change, reset to default clamps for tasks @@ -1224,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p, static void uclamp_fork(struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1246,7 +1246,7 @@ static void uclamp_fork(struct task_struct *p) static void __init init_uclamp(void) { struct uclamp_se uc_max = {}; - unsigned int clamp_id; + enum uclamp_id clamp_id; int cpu; mutex_init(&uclamp_mutex); @@ -6921,7 +6921,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg, struct task_group *parent) { #ifdef CONFIG_UCLAMP_TASK_GROUP - int clamp_id; + enum uclamp_id clamp_id; for_each_clamp_id(clamp_id) { uclamp_se_set(&tg->uclamp_req[clamp_id], @@ -7179,7 +7179,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) struct uclamp_se *uc_parent = NULL; struct uclamp_se *uc_se = NULL; unsigned int eff[UCLAMP_CNT]; - unsigned int clamp_id; + enum uclamp_id clamp_id; unsigned int clamps; css_for_each_descendant_pre(css, top_css) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5b343112a47b..00ff5b57e9cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2281,7 +2281,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, -- cgit v1.2.3 From 711419e504ebd68c8f03656616829c8ad7829389 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 2 Sep 2019 23:14:56 +0000 Subject: irqdomain: Add the missing assignment of domain->fwnode for named fwnode Recently device pass-through stops working for Linux VM running on Hyper-V. 
git-bisect shows the regression is caused by the recent commit 467a3bb97432 ("PCI: hv: Allocate a named fwnode ..."), but the root cause is that the commit d59f6617eef0 forgets to set the domain->fwnode for IRQCHIP_FWNODE_NAMED*, and as a result: 1. The domain->fwnode remains to be NULL. 2. irq_find_matching_fwspec() returns NULL since "h->fwnode == fwnode" is false, and pci_set_bus_msi_domain() sets the Hyper-V PCI root bus's msi_domain to NULL. 3. When the device is added onto the root bus, the device's dev->msi_domain is set to NULL in pci_set_msi_domain(). 4. When a device driver tries to enable MSI-X, pci_msi_setup_msi_irqs() calls arch_setup_msi_irqs(), which uses the native MSI chip (i.e. arch/x86/kernel/apic/msi.c: pci_msi_controller) to set up the irqs, but actually pci_msi_setup_msi_irqs() is supposed to call msi_domain_alloc_irqs() with the hbus->irq_domain, which is created in hv_pcie_init_irq_domain() and is associated with the Hyper-V chip hv_msi_irq_chip. Consequently, the irq line is not properly set up, and the device driver can not receive any interrupt. Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only") Fixes: 467a3bb97432 ("PCI: hv: Allocate a named fwnode instead of an address-based one") Reported-by: Lili Deng Signed-off-by: Dexuan Cui Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/PU1P153MB01694D9AF625AC335C600C5FBFBE0@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e7bbab149750..132672b74e4b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -149,6 +149,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, switch (fwid->type) { case IRQCHIP_FWNODE_NAMED: case IRQCHIP_FWNODE_NAMED_ID: + domain->fwnode = fwnode; domain->name = kstrdup(fwid->name, GFP_KERNEL); if (!domain->name) { kfree(domain); -- cgit v1.2.3 From 82e430a6df7f0b5972c7fe717faffea823c6b84a Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 2 Aug 2019 19:34:23 +0200 Subject: cpuidle: play_idle: Increase the resolution to usec The play_idle resolution is 1ms. The intel_powerclamp bases the idle duration on jiffies. The idle injection API is also using msec based duration but has no user yet. Unfortunately, msec based time does not fit well when we want to inject idle cycle precisely with shallow idle state. In order to set the scene for the incoming idle injection user, move the precision up to usec when calling play_idle. Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. 
Wysocki --- kernel/sched/idle.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80940939b733..b98283fc6914 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -311,7 +311,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -void play_idle(unsigned long duration_ms) +void play_idle(unsigned long duration_us) { struct idle_timer it; @@ -323,7 +323,7 @@ void play_idle(unsigned long duration_ms) WARN_ON_ONCE(current->nr_cpus_allowed != 1); WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); - WARN_ON_ONCE(!duration_ms); + WARN_ON_ONCE(!duration_us); rcu_sleep_check(); preempt_disable(); @@ -333,7 +333,8 @@ void play_idle(unsigned long duration_ms) it.done = 0; hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); it.timer.function = idle_inject_timer_fn; - hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC), + HRTIMER_MODE_REL_PINNED); while (!READ_ONCE(it.done)) do_idle(); -- cgit v1.2.3 From 635714312e6a515de2e78e74bf8d5b10593a4dbf Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Mon, 29 Jul 2019 23:13:59 +0800 Subject: kdb: Replace strncmp with str_has_prefix strncmp(str, const, len) is error-prone. We had better use newly introduced str_has_prefix() instead of it. Signed-off-by: Chuhong Yuan Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 9ecfa37c7fbf..4567fe998c30 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -830,7 +830,7 @@ static void parse_grep(const char *str) cp++; while (isspace(*cp)) cp++; - if (strncmp(cp, "grep ", 5)) { + if (!str_has_prefix(cp, "grep ")) { kdb_printf("invalid 'pipe', see grephelp\n"); return; } -- cgit v1.2.3 From d8a050f5a3e8242242df6430d5980c142350e461 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 21 Oct 2018 21:45:48 -0700 Subject: kgdb: fix comment regarding static function The comment that says that module_event() is not static is clearly wrong. Signed-off-by: Nadav Amit Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 5cc608de6883..10f1187b3907 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -787,11 +787,8 @@ out: } /* - * GDB places a breakpoint at this function to know dynamically - * loaded objects. It's not defined static so that only one instance with this - * name exists in the kernel. + * GDB places a breakpoint at this function to know dynamically loaded objects. */ - static int module_event(struct notifier_block *self, unsigned long val, void *data) { -- cgit v1.2.3 From 14451467014b4c8aff6570b44b6d0ee68cd49bc0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Mar 2019 17:56:43 +0100 Subject: dma-mapping: move the dma_get_sgtable API comments from arm to common code The comments are spot on and should be near the central API, not just near a single implementation. 
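The usage pattern that relocated comment refers to looks roughly like the sketch below. The helper and its error handling are illustrative assumptions; only dma_alloc_coherent(), dma_get_sgtable(), sg_free_table() and dma_free_coherent() are real APIs, and a real dma-buf exporter needs considerably more plumbing than this.

  #include <linux/dma-mapping.h>
  #include <linux/scatterlist.h>

  /* Illustrative only: export a coherent allocation as an sg_table. */
  static int example_export_coherent(struct device *dev, size_t size)
  {
          dma_addr_t dma_handle;
          struct sg_table sgt;
          void *cpu_addr;
          int ret;

          cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, GFP_KERNEL);
          if (!cpu_addr)
                  return -ENOMEM;

          /* This is the call the relocated comment is warning about. */
          ret = dma_get_sgtable(dev, &sgt, cpu_addr, dma_handle, size);
          if (ret)
                  goto free;

          /* ... hand &sgt to a dma-buf exporter here ... */

          sg_free_table(&sgt);
  free:
          dma_free_coherent(dev, size, cpu_addr, dma_handle);
          return ret;
  }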
Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 72c825c1788e..a136932b8e6d 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -136,6 +136,17 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, return ret; } +/* + * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems + * that the intention is to allow exporting memory allocated via the + * coherent DMA APIs through the dma_buf API, which only accepts a + * scattertable. This presents a couple of problems: + * 1. Not all memory allocated via the coherent DMA APIs is backed by + * a struct page + * 2. Passing coherent DMA memory into the streaming APIs is not allowed + * as we will try to flush the memory through a different alias to that + * actually being used (and the flushes are redundant.) + */ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) -- cgit v1.2.3 From f9f3232a7d0ab73a33d11f4056c5823010f03d55 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 15:01:50 +0300 Subject: dma-mapping: explicitly wire up ->mmap and ->get_sgtable While the default ->mmap and ->get_sgtable implementations work for the majority of our dma_map_ops implementations, they are inherently unsafe for others that don't use the page allocator or CMA and/or use their own way of remapping not covered by the common code. So remove the defaults if these methods are not wired up, but instead wire up the default implementations for all safe instances. Fixes: e1c7e324539a ("dma-mapping: always provide the dma_map_ops based implementation") Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a136932b8e6d..c8b4e46407ba 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -153,11 +153,12 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (!dma_is_direct(ops) && ops->get_sgtable) - return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, - attrs); - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, - attrs); + if (dma_is_direct(ops)) + return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, + size, attrs); + if (!ops->get_sgtable) + return -ENXIO; + return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); } EXPORT_SYMBOL(dma_get_sgtable_attrs); @@ -240,9 +241,12 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (!dma_is_direct(ops) && ops->mmap) - return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); + if (dma_is_direct(ops)) + return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, + attrs); + if (!ops->mmap) + return -ENXIO; + return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); } EXPORT_SYMBOL(dma_mmap_attrs); -- cgit v1.2.3 From e29ccc188f3dae1cb66f59e10e01e0f150642a54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Aug 2019 13:31:25 +0300 Subject: dma-mapping: add a dma_can_mmap helper Add a helper to check if DMA allocations for a specific device can be mapped to userspace using dma_mmap_*.
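A minimal caller might look like this sketch; struct example_dev and example_mmap() are invented for illustration, while dma_can_mmap() and dma_mmap_coherent() are the real interfaces involved.

  #include <linux/dma-mapping.h>
  #include <linux/mm.h>

  struct example_dev {                    /* assumed driver state */
          struct device *dev;
          void *cpu_addr;
          dma_addr_t dma_handle;
          size_t size;
  };

  static int example_mmap(struct example_dev *ed, struct vm_area_struct *vma)
  {
          /* Refuse the mapping up front if the dma_map_ops cannot support it. */
          if (!dma_can_mmap(ed->dev))
                  return -ENXIO;

          return dma_mmap_coherent(ed->dev, vma, ed->cpu_addr,
                                   ed->dma_handle, ed->size);
  }

This lets a driver fail with a clear error at mmap() time instead of relying on dma_mmap_*() failing deeper in the call chain.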
Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index c8b4e46407ba..18ba1ac93fc1 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -222,6 +222,29 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, #endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ } +/** + * dma_can_mmap - check if a given device supports dma_mmap_* + * @dev: device to check + * + * Returns %true if @dev supports dma_mmap_coherent() and dma_mmap_attrs() to + * map DMA allocations to userspace. + */ +bool dma_can_mmap(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (IS_ENABLED(CONFIG_ARCH_NO_COHERENT_DMA_MMAP)) + return false; + + if (dma_is_direct(ops)) { + return dev_is_dma_coherent(dev) || + IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN); + } + + return ops->mmap != NULL; +} +EXPORT_SYMBOL_GPL(dma_can_mmap); + /** * dma_mmap_attrs - map a coherent DMA allocation into user space * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices -- cgit v1.2.3 From 62fcee9a3bd73e279d3052245a652a918d0c51da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 15:06:40 +0300 Subject: dma-mapping: remove CONFIG_ARCH_NO_COHERENT_DMA_MMAP CONFIG_ARCH_NO_COHERENT_DMA_MMAP is now functionally identical to !CONFIG_MMU, so remove the separate symbol. The only difference is that arm did not set it for !CONFIG_MMU, but arm uses a separate dma mapping implementation including its own mmap method, which is handled by moving the CONFIG_MMU check in dma_can_mmap so that is only applies to the dma-direct case, just as the other ifdefs for it. Signed-off-by: Christoph Hellwig Acked-by: Geert Uytterhoeven # m68k --- kernel/dma/mapping.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 18ba1ac93fc1..285de5fbc8e9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -188,7 +188,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { -#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP +#ifdef CONFIG_MMU unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; @@ -219,7 +219,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, user_count << PAGE_SHIFT, vma->vm_page_prot); #else return -ENXIO; -#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ +#endif /* CONFIG_MMU */ } /** @@ -233,12 +233,10 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (IS_ENABLED(CONFIG_ARCH_NO_COHERENT_DMA_MMAP)) - return false; - if (dma_is_direct(ops)) { - return dev_is_dma_coherent(dev) || - IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN); + return IS_ENABLED(CONFIG_MMU) && + (dev_is_dma_coherent(dev) || + IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)); } return ops->mmap != NULL; -- cgit v1.2.3 From 1fa0682448acd5198f79c1d28ee1292a27ae406d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Aug 2019 08:46:57 +0200 Subject: dma-mapping: remove dma_release_declared_memory This function is entirely unused given that declared memory is generally provided by platform setup code. 
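To make the intended division of labour concrete, platform setup code typically declares such memory along the lines of the sketch below. The board function, addresses and size are made up, and the four-argument dma_declare_coherent_memory() prototype is assumed to match the one visible in the diffs that follow.

  #include <linux/dma-mapping.h>
  #include <linux/sizes.h>

  /* Hypothetical board setup: dedicate 1 MiB of on-chip SRAM to one device. */
  static int example_board_declare_sram(struct device *dev)
  {
          return dma_declare_coherent_memory(dev,
                                             0x90000000,  /* CPU physical address */
                                             0x90000000,  /* address as seen by the device */
                                             SZ_1M);
  }

Drivers then simply call dma_alloc_coherent() and transparently get memory from that region, which is why they never had a reason to release the declaration themselves.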
Signed-off-by: Christoph Hellwig --- kernel/dma/coherent.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 29fd6590dc1e..7271cda86a37 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -124,17 +124,6 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, } EXPORT_SYMBOL(dma_declare_coherent_memory); -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dma_release_coherent_memory(mem); - dev->dma_mem = NULL; -} -EXPORT_SYMBOL(dma_release_declared_memory); - static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) { -- cgit v1.2.3 From 7a01ee42209c00e551c7cfb581be7a207a376f00 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Aug 2019 08:40:33 +0200 Subject: dma-mapping: remove the dma_mmap_from_dev_coherent export dma_mmap_from_dev_coherent is only used by dma_map_ops instances, none of which is modular. Signed-off-by: Christoph Hellwig --- kernel/dma/coherent.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 7271cda86a37..7cafe1affdc9 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -277,7 +277,6 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret); } -EXPORT_SYMBOL(dma_mmap_from_dev_coherent); int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) -- cgit v1.2.3 From d9295532d5725e5926d76470acdfd239c8d2aad3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Aug 2019 08:48:27 +0200 Subject: dma-mapping: remove the dma_declare_coherent_memory export dma_declare_coherent_memory is something that the platform setup code (which pretty much means the device tree these days) needs to do so that drivers can use the memory as declared by the platform. Drivers themselves have no business calling this function. Signed-off-by: Christoph Hellwig --- kernel/dma/coherent.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 7cafe1affdc9..545e3869b0e3 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -122,7 +122,6 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_release_coherent_memory(mem); return ret; } -EXPORT_SYMBOL(dma_declare_coherent_memory); static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) { -- cgit v1.2.3 From 249baa54790171438524ba97e8e0485dd6aa2762 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 15:01:38 +0300 Subject: dma-mapping: provide a better default ->get_required_mask Most dma_map_ops instances are IOMMUs that work perfectly fine in 32-bits of IOVA space, and the generic direct mapping code already provides its own routine that is intelligent based on the amount of memory actually present. Wire up the dma-direct routine for the ARM direct mapping code as well, and otherwise default to the constant 32-bit mask. This way we only need to override it for the occasional odd IOMMU that requires 64-bit IOVA support, or IOMMU drivers that are more efficient if they can fall back to the direct mapping.
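As a usage sketch (the driver function is hypothetical; dma_get_required_mask() and dma_set_mask_and_coherent() are the real APIs), a driver that supports both narrow and wide descriptors can use the reported mask to pick the cheaper option:

  #include <linux/dma-mapping.h>

  static int example_setup_dma(struct device *dev)
  {
          /* Only pay for 64-bit addressing if the platform actually needs it. */
          if (dma_get_required_mask(dev) > DMA_BIT_MASK(32))
                  return dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));

          return dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
  }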
Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 285de5fbc8e9..64a3d294f4b4 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -271,25 +271,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL(dma_mmap_attrs); -static u64 dma_default_get_required_mask(struct device *dev) -{ - u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); - u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT)); - u64 mask; - - if (!high_totalram) { - /* convert to mask just covering totalram */ - low_totalram = (1 << (fls(low_totalram) - 1)); - low_totalram += low_totalram - 1; - mask = low_totalram; - } else { - high_totalram = (1 << (fls(high_totalram) - 1)); - high_totalram += high_totalram - 1; - mask = (((u64)high_totalram) << 32) + 0xffffffff; - } - return mask; -} - u64 dma_get_required_mask(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -298,7 +279,16 @@ u64 dma_get_required_mask(struct device *dev) return dma_direct_get_required_mask(dev); if (ops->get_required_mask) return ops->get_required_mask(dev); - return dma_default_get_required_mask(dev); + + /* + * We require every DMA ops implementation to at least support a 32-bit + * DMA mask (and use bounce buffering if that isn't supported in + * hardware). As the direct mapping code has its own routine to + * actually report an optimal mask we default to 32-bit here as that + * is the right thing for most IOMMUs, and at least not actively + * harmful in general. + */ + return DMA_BIT_MASK(32); } EXPORT_SYMBOL_GPL(dma_get_required_mask); -- cgit v1.2.3 From 512317401f6a337e617ec284d20dec5fa3a951ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Aug 2019 08:51:01 +0200 Subject: dma-mapping: always use VM_DMA_COHERENT for generic DMA remap Currently the generic dma remap allocator gets a vm_flags passed by the caller that is a little confusing. We just introduced a generic vmalloc-level flag to identify the dma coherent allocations, so use that everywhere and remove the now pointless argument. 
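In caller terms the change boils down to the following before/after pair, condensed from the arch_dma_alloc() and atomic pool hunks in the diff below:

  /* Before: every caller passed its own vm_flags, e.g. VM_USERMAP. */
  ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
                                    dma_pgprot(dev, PAGE_KERNEL, attrs),
                                    __builtin_return_address(0));
  ...
  dma_common_free_remap(ret, size, VM_USERMAP);

  /* After: the area is always tagged VM_DMA_COHERENT internally. */
  ret = dma_common_contiguous_remap(page, size,
                                    dma_pgprot(dev, PAGE_KERNEL, attrs),
                                    __builtin_return_address(0));
  ...
  dma_common_free_remap(ret, size);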
Signed-off-by: Christoph Hellwig --- kernel/dma/remap.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 838123f79639..f6b90521a7e4 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -12,12 +12,11 @@ #include static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, unsigned long vm_flags, pgprot_t prot, - const void *caller) + size_t size, pgprot_t prot, const void *caller) { struct vm_struct *area; - area = get_vm_area_caller(size, vm_flags, caller); + area = get_vm_area_caller(size, VM_DMA_COHERENT, caller); if (!area) return NULL; @@ -34,12 +33,11 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, * Cannot be used in non-sleeping contexts */ void *dma_common_pages_remap(struct page **pages, size_t size, - unsigned long vm_flags, pgprot_t prot, - const void *caller) + pgprot_t prot, const void *caller) { struct vm_struct *area; - area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + area = __dma_common_pages_remap(pages, size, prot, caller); if (!area) return NULL; @@ -53,7 +51,6 @@ void *dma_common_pages_remap(struct page **pages, size_t size, * Cannot be used in non-sleeping contexts */ void *dma_common_contiguous_remap(struct page *page, size_t size, - unsigned long vm_flags, pgprot_t prot, const void *caller) { int i; @@ -67,7 +64,7 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, for (i = 0; i < (size >> PAGE_SHIFT); i++) pages[i] = nth_page(page, i); - area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + area = __dma_common_pages_remap(pages, size, prot, caller); kfree(pages); @@ -79,11 +76,11 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, /* * Unmaps a range previously mapped by dma_common_*_remap */ -void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) +void dma_common_free_remap(void *cpu_addr, size_t size) { struct vm_struct *area = find_vm_area(cpu_addr); - if (!area || (area->flags & vm_flags) != vm_flags) { + if (!area || area->flags != VM_DMA_COHERENT) { WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); return; } @@ -136,7 +133,7 @@ static int __init dma_atomic_pool_init(void) if (!atomic_pool) goto free_page; - addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP, + addr = dma_common_contiguous_remap(page, atomic_pool_size, pgprot_dmacoherent(PAGE_KERNEL), __builtin_return_address(0)); if (!addr) @@ -153,7 +150,7 @@ static int __init dma_atomic_pool_init(void) return 0; remove_mapping: - dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP); + dma_common_free_remap(addr, atomic_pool_size); destroy_genpool: gen_pool_destroy(atomic_pool); atomic_pool = NULL; @@ -228,7 +225,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, arch_dma_prep_coherent(page, size); /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, size, VM_USERMAP, + ret = dma_common_contiguous_remap(page, size, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { -- cgit v1.2.3 From 5cf4537975bbd5691b9ddd015d540bb92f61e322 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jun 2019 09:14:31 +0200 Subject: dma-mapping: introduce a dma_common_find_pages helper A helper to find the backing page array based on a virtual address. 
This also ensures we do the same vm_flags check everywhere instead of slightly different or missing ones in a few places. Signed-off-by: Christoph Hellwig --- kernel/dma/remap.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index f6b90521a7e4..ca4e5d44b571 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -11,6 +11,15 @@ #include #include +struct page **dma_common_find_pages(void *cpu_addr) +{ + struct vm_struct *area = find_vm_area(cpu_addr); + + if (!area || area->flags != VM_DMA_COHERENT) + return NULL; + return area->pages; +} + static struct vm_struct *__dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller) { @@ -78,9 +87,9 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, */ void dma_common_free_remap(void *cpu_addr, size_t size) { - struct vm_struct *area = find_vm_area(cpu_addr); + struct page **pages = dma_common_find_pages(cpu_addr); - if (!area || area->flags != VM_DMA_COHERENT) { + if (!pages) { WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); return; } -- cgit v1.2.3 From 858805b336be1cabb3d9033adaa3676574d12e37 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 25 Aug 2019 22:28:37 +0900 Subject: kbuild: add $(BASH) to run scripts with bash-extension CONFIG_SHELL falls back to sh when bash is not installed on the system, but nobody is testing such a case since bash is usually installed. So, shell scripts invoked by CONFIG_SHELL are only tested with bash. It makes it difficult to test whether the hashbang #!/bin/sh is real. For example, #!/bin/sh in arch/powerpc/kernel/prom_init_check.sh is false. (I fixed it up) Besides, some shell scripts invoked by CONFIG_SHELL use bash-extension and #!/bin/bash is specified as the hashbang, while CONFIG_SHELL may not always be set to bash. Probably, the right thing to do is to introduce BASH, which is bash by default, and always set CONFIG_SHELL to sh. Replace $(CONFIG_SHELL) with $(BASH) for bash scripts. If somebody tries to add bash-extension to a #!/bin/sh script, it will be caught in testing because /bin/sh is a symlink to dash on some major distributions. 
Signed-off-by: Masahiro Yamada --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index ef0d95a190b4..6027677f89e8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -126,7 +126,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz -cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ + cmd_genikh = $(BASH) $(srctree)/kernel/gen_kheaders.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) -- cgit v1.2.3 From 1251201c0d34fadf69d56efa675c2b7dd0a90eca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 4 Sep 2019 09:55:32 +0200 Subject: sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code Thadeu Lima de Souza Cascardo reported that 'chrt' broke on recent kernels: $ chrt -p $$ chrt: failed to get pid 26306's policy: Argument list too long and he has root-caused the bug to the following commit increasing sched_attr size and breaking sched_read_attr() into returning -EFBIG: a509a7cd7974 ("sched/uclamp: Extend sched_setattr() to support utilization clamping") The other, bigger bug is that the whole sched_getattr() and sched_read_attr() logic of checking non-zero bits in new ABI components is arguably broken, and pretty much any extension of the ABI will spuriously break the ABI. That's way too fragile. Instead implement the perf syscall's extensible ABI instead, which we already implement on the sched_setattr() side: - if user-attributes have the same size as kernel attributes then the logic is unchanged. - if user-attributes are larger than the kernel knows about then simply skip the extra bits, but set attr->size to the (smaller) kernel size so that tooling can (in principle) handle older kernel as well. - if user-attributes are smaller than the kernel knows about then just copy whatever user-space can accept. Also clean up the whole logic: - Simplify the code flow - there's no need for 'ret' for example. - Standardize on 'kattr/uattr' and 'ksize/usize' naming to make sure we always know which side we are dealing with. - Why is it called 'read' when what it does is to copy to user? This code is so far away from VFS read() semantics that the naming is actively confusing. Name it sched_attr_copy_to_user() instead, which mirrors other copy_to_user() functionality. - Move the attr->size assignment from the head of sched_getattr() to the sched_attr_copy_to_user() function. Nothing else within the kernel should care about the size of the structure. With these fixes the sched_getattr() syscall now nicely supports an extensible ABI in both a forward and backward compatible fashion, and will also fix the chrt bug. As an added bonus the bogus -EFBIG return is removed as well, which as Thadeu noted should have been -E2BIG to begin with. 
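The fixed ABI can be exercised from user space with a sketch like the one below. This is not part of the patch: the abbreviated structure mirrors the original SCHED_ATTR_SIZE_VER0 layout, and the raw syscall(2) invocation assumes a libc that defines SYS_sched_getattr.

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>

  /* sched_attr as an old utility (e.g. a pre-uclamp chrt) would have built it. */
  struct old_sched_attr {
          uint32_t size;
          uint32_t sched_policy;
          uint64_t sched_flags;
          int32_t  sched_nice;
          uint32_t sched_priority;
          uint64_t sched_runtime;
          uint64_t sched_deadline;
          uint64_t sched_period;
  };

  int main(void)
  {
          struct old_sched_attr attr;

          memset(&attr, 0, sizeof(attr));
          /*
           * With the fix the kernel copies at most sizeof(attr) bytes and
           * sets attr.size to min(user size, kernel size) instead of
           * failing with -EFBIG when the two sides disagree.
           */
          if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0))
                  return 1;

          printf("policy=%u reported size=%u\n", attr.sched_policy, attr.size);
          return 0;
  }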
Reported-by: Thadeu Lima de Souza Cascardo Tested-by: Dietmar Eggemann Tested-by: Thadeu Lima de Souza Cascardo Acked-by: Thadeu Lima de Souza Cascardo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: a509a7cd7974 ("sched/uclamp: Extend sched_setattr() to support utilization clamping") Link: https://lkml.kernel.org/r/20190904075532.GA26751@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 78 ++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 010d578118d6..df9f1fe5689b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5105,37 +5105,40 @@ out_unlock: return retval; } -static int sched_read_attr(struct sched_attr __user *uattr, - struct sched_attr *attr, - unsigned int usize) +/* + * Copy the kernel size attribute structure (which might be larger + * than what user-space knows about) to user-space. + * + * Note that all cases are valid: user-space buffer can be larger or + * smaller than the kernel-space buffer. The usual case is that both + * have the same size. + */ +static int +sched_attr_copy_to_user(struct sched_attr __user *uattr, + struct sched_attr *kattr, + unsigned int usize) { - int ret; + unsigned int ksize = sizeof(*kattr); if (!access_ok(uattr, usize)) return -EFAULT; /* - * If we're handed a smaller struct than we know of, - * ensure all the unknown bits are 0 - i.e. old - * user-space does not get uncomplete information. + * sched_getattr() ABI forwards and backwards compatibility: + * + * If usize == ksize then we just copy everything to user-space and all is good. + * + * If usize < ksize then we only copy as much as user-space has space for, + * this keeps ABI compatibility as well. We skip the rest. + * + * If usize > ksize then user-space is using a newer version of the ABI, + * which part the kernel doesn't know about. Just ignore it - tooling can + * detect the kernel's knowledge of attributes from the attr->size value + * which is set to ksize in this case. */ - if (usize < sizeof(*attr)) { - unsigned char *addr; - unsigned char *end; + kattr->size = min(usize, ksize); - addr = (void *)attr + usize; - end = (void *)attr + sizeof(*attr); - - for (; addr < end; addr++) { - if (*addr) - return -EFBIG; - } - - attr->size = usize; - } - - ret = copy_to_user(uattr, attr, attr->size); - if (ret) + if (copy_to_user(uattr, kattr, kattr->size)) return -EFAULT; return 0; @@ -5145,20 +5148,18 @@ static int sched_read_attr(struct sched_attr __user *uattr, * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. - * @size: sizeof(attr) for fwd/bwd comp. + * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility. * @flags: for future extension. 
*/ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size, unsigned int, flags) + unsigned int, usize, unsigned int, flags) { - struct sched_attr attr = { - .size = sizeof(struct sched_attr), - }; + struct sched_attr kattr = { }; struct task_struct *p; int retval; - if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0 || flags) + if (!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; rcu_read_lock(); @@ -5171,25 +5172,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, if (retval) goto out_unlock; - attr.sched_policy = p->policy; + kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) - attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; if (task_has_dl_policy(p)) - __getparam_dl(p, &attr); + __getparam_dl(p, &kattr); else if (task_has_rt_policy(p)) - attr.sched_priority = p->rt_priority; + kattr.sched_priority = p->rt_priority; else - attr.sched_nice = task_nice(p); + kattr.sched_nice = task_nice(p); #ifdef CONFIG_UCLAMP_TASK - attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif rcu_read_unlock(); - retval = sched_read_attr(uattr, &attr, size); - return retval; + return sched_attr_copy_to_user(uattr, &kattr, usize); out_unlock: rcu_read_unlock(); -- cgit v1.2.3 From e336b4027775cb458dc713745e526fa1a1996b2a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 3 Sep 2019 20:08:21 +0900 Subject: kprobes: Prohibit probing on BUG() and WARN() address Since BUG() and WARN() may use a trap (e.g. UD2 on x86) to get the address where the BUG() has occurred, kprobes can not do single-step out-of-line that instruction. So prohibit probing on such address. Without this fix, if someone put a kprobe on WARN(), the kernel will crash with invalid opcode error instead of outputing warning message, because kernel can not find correct bug address. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Acked-by: Naveen N. Rao Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Linus Torvalds Cc: Naveen N . Rao Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/156750890133.19112.3393666300746167111.stgit@devnote2 Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d9770a5393c8..ebe8315a756a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1514,7 +1514,8 @@ static int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || - jump_label_text_reserved(p->addr, p->addr)) { + jump_label_text_reserved(p->addr, p->addr) || + find_bug((unsigned long)p->addr)) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From 5d2295f3a93b04986d069ebeaf5b07725f9096c1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 4 Sep 2019 16:55:27 +0200 Subject: hrtimer: Add a missing bracket and hide `migration_base' on !SMP The recent change to avoid taking the expiry lock when a timer is currently migrated missed to add a bracket at the end of the if statement leading to compile errors. 
Since that commit the variable `migration_base' is always used but it is only available on SMP configuration thus leading to another compile error. The changelog says "The timer base and base->cpu_base cannot be NULL in the code path", so it is safe to limit this check to SMP configurations only. Add the missing bracket to the if statement and hide `migration_base' behind CONFIG_SMP bars. [ tglx: Mark the functions inline ... ] Fixes: 68b2c8c1e4210 ("hrtimer: Don't take expiry_lock when timer is currently migrated") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190904145527.eah7z56ntwobqm6j@linutronix.de --- kernel/time/hrtimer.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ebbd0fb3512b..0d4dc241c0fb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -140,6 +140,11 @@ static struct hrtimer_cpu_base migration_cpu_base = { #define migration_base migration_cpu_base.clock_base[0] +static inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} + /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -264,6 +269,11 @@ again: #else /* CONFIG_SMP */ +static inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} + static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { @@ -1221,7 +1231,7 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer) * Just relax if the timer expires in hard interrupt context or if * it is currently on the migration base. */ - if (!timer->is_soft || base == &migration_base) + if (!timer->is_soft || is_migration_base(base)) { cpu_relax(); return; } -- cgit v1.2.3 From 2339cd6cd0b5401fa3fe886bf1c0cb8822041957 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Sep 2019 15:16:17 -0700 Subject: bpf: fix precision tracking of stack slots The problem can be seen in the following two tests: 0: (bf) r3 = r10 1: (55) if r3 != 0x7b goto pc+0 2: (7a) *(u64 *)(r3 -8) = 0 3: (79) r4 = *(u64 *)(r10 -8) .. 0: (85) call bpf_get_prandom_u32#7 1: (bf) r3 = r10 2: (55) if r3 != 0x7b goto pc+0 3: (7b) *(u64 *)(r3 -8) = r0 4: (79) r4 = *(u64 *)(r10 -8) When backtracking need to mark R4 it will mark slot fp-8. But ST or STX into fp-8 could belong to the same block of instructions. When backtracing is done the parent state may have fp-8 slot as "unallocated stack". Which will cause verifier to warn and incorrectly reject such programs. Writes into stack via non-R10 register are rare. llvm always generates canonical stack spill/fill. For such pathological case fall back to conservative precision tracking instead of rejecting. 
Reported-by: syzbot+c8d66267fd2b5955287e@syzkaller.appspotmail.com Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b5c14c9d7b98..c36a719fee6d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1772,16 +1772,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bitmap_from_u64(mask, stack_mask); for_each_set_bit(i, mask, 64) { if (i >= func->allocated_stack / BPF_REG_SIZE) { - /* This can happen if backtracking - * is propagating stack precision where - * caller has larger stack frame - * than callee, but backtrack_insn() should - * have returned -ENOTSUPP. + /* the sequence of instructions: + * 2: (bf) r3 = r10 + * 3: (7b) *(u64 *)(r3 -8) = r0 + * 4: (79) r4 = *(u64 *)(r10 -8) + * doesn't contain jmps. It's backtracked + * as a single block. + * During backtracking insn 3 is not recognized as + * stack access, so at the end of backtracking + * stack slot fp-8 is still marked in stack_mask. + * However the parent state may not have accessed + * fp-8 and it's "unallocated" stack space. + * In such case fallback to conservative. */ - verbose(env, "BUG spi %d stack_size %d\n", - i, func->allocated_stack); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; + mark_all_scalars_precise(env, st); + return 0; } if (func->stack[i].slot_type[0] != STACK_SPILL) { -- cgit v1.2.3 From ac68154626ab7fe4ce5f424937c34f42a3e20c5b Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Fri, 12 Jul 2019 09:53:08 +0800 Subject: tracing: Add "gfp_t" support in synthetic_events Add "gfp_t" support in synthetic_events, then the "gfp_t" type parameter in some functions can be traced. Prints the gfp flags as hex in addition to the human-readable flag string. Example output: whoopsie-630 [000] ...1 78.969452: testevent: bar=b20 (GFP_ATOMIC|__GFP_ZERO) rcuc/0-11 [000] ...1 81.097555: testevent: bar=a20 (GFP_ATOMIC) rcuc/0-11 [000] ...1 81.583123: testevent: bar=a20 (GFP_ATOMIC) Link: http://lkml.kernel.org/r/20190712015308.9908-1-zhengjun.xing@linux.intel.com Signed-off-by: Zhengjun Xing [ Added printing of flag names ] Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 65e7d071ed28..3a6e42aa08e6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -13,6 +13,10 @@ #include #include +/* for gfp flag names */ +#include +#include + #include "tracing_map.h" #include "trace.h" #include "trace_dynevent.h" @@ -752,6 +756,8 @@ static int synth_field_size(char *type) size = sizeof(unsigned long); else if (strcmp(type, "pid_t") == 0) size = sizeof(pid_t); + else if (strcmp(type, "gfp_t") == 0) + size = sizeof(gfp_t); else if (synth_field_is_string(type)) size = synth_field_string_size(type); @@ -792,6 +798,8 @@ static const char *synth_field_fmt(char *type) fmt = "%lu"; else if (strcmp(type, "pid_t") == 0) fmt = "%d"; + else if (strcmp(type, "gfp_t") == 0) + fmt = "%x"; else if (synth_field_is_string(type)) fmt = "%s"; @@ -834,9 +842,20 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, i == se->n_fields - 1 ? 
"" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { + struct trace_print_flags __flags[] = { + __def_gfpflag_names, {-1, NULL} }; + trace_seq_printf(s, print_fmt, se->fields[i]->name, entry->fields[n_u64], i == se->n_fields - 1 ? "" : " "); + + if (strcmp(se->fields[i]->type, "gfp_t") == 0) { + trace_seq_puts(s, " ("); + trace_print_flags_seq(s, "|", + entry->fields[n_u64], + __flags); + trace_seq_putc(s, ')'); + } n_u64++; } } -- cgit v1.2.3 From f18ddc13af981ce3c7b7f26925f099e7c6929aba Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 3 Sep 2019 14:18:02 -0300 Subject: alarmtimer: Use EOPNOTSUPP instead of ENOTSUPP ENOTSUPP is not supposed to be returned to userspace. This was found on an OpenPower machine, where the RTC does not support set_alarm. On that system, a clock_nanosleep(CLOCK_REALTIME_ALARM, ...) results in "524 Unknown error 524" Replace it with EOPNOTSUPP which results in the expected "95 Operation not supported" error. Fixes: 1c6b39ad3f01 (alarmtimers: Return -ENOTSUPP if no RTC device is present) Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190903171802.28314-1-cascardo@canonical.com --- kernel/time/alarmtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 57518efc3810..b7d75a9e8ccf 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -672,7 +672,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) enum alarmtimer_type type; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!capable(CAP_WAKE_ALARM)) return -EPERM; @@ -790,7 +790,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, int ret = 0; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EOPNOTSUPP; if (flags & ~TIMER_ABSTIME) return -EINVAL; -- cgit v1.2.3 From eddf3e9c7c7e4d0707c68d1bb22cc6ec8aef7d4a Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Wed, 4 Sep 2019 20:46:25 +0800 Subject: genirq: Prevent NULL pointer dereference in resend_irqs() The following crash was observed: Unable to handle kernel NULL pointer dereference at 0000000000000158 Internal error: Oops: 96000004 [#1] SMP pc : resend_irqs+0x68/0xb0 lr : resend_irqs+0x64/0xb0 ... Call trace: resend_irqs+0x68/0xb0 tasklet_action_common.isra.6+0x84/0x138 tasklet_action+0x2c/0x38 __do_softirq+0x120/0x324 run_ksoftirqd+0x44/0x60 smpboot_thread_fn+0x1ac/0x1e8 kthread+0x134/0x138 ret_from_fork+0x10/0x18 The reason for this is that the interrupt resend mechanism happens in soft interrupt context, which is a asynchronous mechanism versus other operations on interrupts. free_irq() does not take resend handling into account. Thus, the irq descriptor might be already freed before the resend tasklet is executed. resend_irqs() does not check the return value of the interrupt descriptor lookup and derefences the return value unconditionally. 1): __setup_irq irq_startup check_irq_resend // activate softirq to handle resend irq 2): irq_domain_free_irqs irq_free_descs free_desc call_rcu(&desc->rcu, delayed_free_desc) 3): __do_softirq tasklet_action resend_irqs desc = irq_to_desc(irq) desc->handle_irq(desc) // desc is NULL --> Ooops Fix this by adding a NULL pointer check in resend_irqs() before derefencing the irq descriptor. 
Fixes: a4633adcdbc1 ("[PATCH] genirq: add genirq sw IRQ-retrigger") Signed-off-by: Yunfeng Ye Signed-off-by: Thomas Gleixner Reviewed-by: Zhiqiang Liu Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1630ae13-5c8e-901e-de09-e740b6a426a7@huawei.com --- kernel/irq/resend.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 95414ad3506a..98c04ca5fa43 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -36,6 +36,8 @@ static void resend_irqs(unsigned long arg) irq = find_first_bit(irqs_resend, nr_irqs); clear_bit(irq, irqs_resend); desc = irq_to_desc(irq); + if (!desc) + continue; local_irq_disable(); desc->handle_irq(desc); local_irq_enable(); -- cgit v1.2.3 From 310aa0a25b338b3100c94880c9a69bec8ce8c3ae Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Fri, 6 Sep 2019 14:01:16 +0800 Subject: perf/hw_breakpoint: Fix arch_hw_breakpoint use-before-initialization If we disable the compiler's auto-initialization feature, if -fplugin-arg-structleak_plugin-byref or -ftrivial-auto-var-init=pattern are disabled, arch_hw_breakpoint may be used before initialization after: 9a4903dde2c86 ("perf/hw_breakpoint: Split attribute parse and commit") On our ARM platform, the struct step_ctrl in arch_hw_breakpoint, which used to be zero-initialized by kzalloc(), may be used in arch_install_hw_breakpoint() without initialization. Signed-off-by: Mark-PK Tsai Cc: Alexander Shishkin Cc: Alix Wu Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: YJ Chiang Link: https://lkml.kernel.org/r/20190906060115.9460-1-mark-pk.tsai@mediatek.com [ Minor edits. ] Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index c5cd852fe86b..3cc8416ec844 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -413,7 +413,7 @@ static int hw_breakpoint_parse(struct perf_event *bp, int register_perf_hw_breakpoint(struct perf_event *bp) { - struct arch_hw_breakpoint hw; + struct arch_hw_breakpoint hw = { }; int err; err = reserve_bp_slot(bp); @@ -461,7 +461,7 @@ int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { - struct arch_hw_breakpoint hw; + struct arch_hw_breakpoint hw = { }; int err; err = hw_breakpoint_parse(bp, attr, &hw); -- cgit v1.2.3 From 175fca3bf91a1111b7e46f6655666640556b9059 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 23 Aug 2019 21:49:13 +0200 Subject: kexec: add KEXEC_ELF Right now powerpc provides an implementation to read elf files with the kexec_file_load() syscall. Make that available as a public kexec interface so it can be re-used on other architectures. 
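For orientation, an architecture-side loader built on top of this interface might look roughly like the sketch below. The function name, the kexec_buf initialisation and the error handling are illustrative assumptions; only kexec_build_elf_info(), kexec_elf_load() and kexec_free_elf_info() come from this patch, and a real loader also has to handle the initrd, the command line and purgatory setup.

  #include <linux/kexec.h>
  #include <linux/err.h>

  static void *example_elf_load(struct kimage *image, char *kernel_buf,
                                unsigned long kernel_len)
  {
          struct elfhdr ehdr;
          struct kexec_elf_info elf_info;
          struct kexec_buf kbuf = { .image = image, .buf_min = 0,
                                    .buf_max = -1UL, .top_down = false };
          unsigned long kernel_load_addr;
          int ret;

          ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info);
          if (ret)
                  return ERR_PTR(ret);

          ret = kexec_elf_load(image, &ehdr, &elf_info, &kbuf,
                               &kernel_load_addr);
          kexec_free_elf_info(&elf_info);
          if (ret)
                  return ERR_PTR(ret);

          image->start = ehdr.e_entry;    /* jump to the ELF entry point */
          return NULL;
  }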
Signed-off-by: Sven Schnelle Reviewed-by: Thiago Jung Bauermann Signed-off-by: Helge Deller --- kernel/Makefile | 1 + kernel/kexec_elf.c | 549 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 550 insertions(+) create mode 100644 kernel/kexec_elf.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index a8d923b5481b..002220b7ca7c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o +obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c new file mode 100644 index 000000000000..26c6310167a0 --- /dev/null +++ b/kernel/kexec_elf.c @@ -0,0 +1,549 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Load ELF vmlinux file for the kexec_file_load syscall. + * + * Copyright (C) 2004 Adam Litke (agl@us.ibm.com) + * Copyright (C) 2004 IBM Corp. + * Copyright (C) 2005 R Sharada (sharada@in.ibm.com) + * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com) + * Copyright (C) 2016 IBM Corporation + * + * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c. + * Heavily modified for the kernel by + * Thiago Jung Bauermann . + */ + +#define pr_fmt(fmt) "kexec_elf: " fmt + +#include +#include +#include +#include +#include + +#define PURGATORY_STACK_SIZE (16 * 1024) + +#define elf_addr_to_cpu elf64_to_cpu + +#ifndef Elf_Rel +#define Elf_Rel Elf64_Rel +#endif /* Elf_Rel */ + +static inline bool elf_is_elf_file(const struct elfhdr *ehdr) +{ + return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0; +} + +static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value) +{ + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) + value = le64_to_cpu(value); + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) + value = be64_to_cpu(value); + + return value; +} + +static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value) +{ + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) + value = le16_to_cpu(value); + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) + value = be16_to_cpu(value); + + return value; +} + +static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value) +{ + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) + value = le32_to_cpu(value); + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) + value = be32_to_cpu(value); + + return value; +} + +/** + * elf_is_ehdr_sane - check that it is safe to use the ELF header + * @buf_len: size of the buffer in which the ELF file is loaded. + */ +static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len) +{ + if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) { + pr_debug("Bad program header size.\n"); + return false; + } else if (ehdr->e_shnum > 0 && + ehdr->e_shentsize != sizeof(struct elf_shdr)) { + pr_debug("Bad section header size.\n"); + return false; + } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT || + ehdr->e_version != EV_CURRENT) { + pr_debug("Unknown ELF version.\n"); + return false; + } + + if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) { + size_t phdr_size; + + /* + * e_phnum is at most 65535 so calculating the size of the + * program header cannot overflow. + */ + phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum; + + /* Sanity check the program header table location. 
*/ + if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) { + pr_debug("Program headers at invalid location.\n"); + return false; + } else if (ehdr->e_phoff + phdr_size > buf_len) { + pr_debug("Program headers truncated.\n"); + return false; + } + } + + if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) { + size_t shdr_size; + + /* + * e_shnum is at most 65536 so calculating + * the size of the section header cannot overflow. + */ + shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum; + + /* Sanity check the section header table location. */ + if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) { + pr_debug("Section headers at invalid location.\n"); + return false; + } else if (ehdr->e_shoff + shdr_size > buf_len) { + pr_debug("Section headers truncated.\n"); + return false; + } + } + + return true; +} + +static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr) +{ + struct elfhdr *buf_ehdr; + + if (len < sizeof(*buf_ehdr)) { + pr_debug("Buffer is too small to hold ELF header.\n"); + return -ENOEXEC; + } + + memset(ehdr, 0, sizeof(*ehdr)); + memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident)); + if (!elf_is_elf_file(ehdr)) { + pr_debug("No ELF header magic.\n"); + return -ENOEXEC; + } + + if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) { + pr_debug("Not a supported ELF class.\n"); + return -ENOEXEC; + } else if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB && + ehdr->e_ident[EI_DATA] != ELFDATA2MSB) { + pr_debug("Not a supported ELF data format.\n"); + return -ENOEXEC; + } + + buf_ehdr = (struct elfhdr *) buf; + if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) { + pr_debug("Bad ELF header size.\n"); + return -ENOEXEC; + } + + ehdr->e_type = elf16_to_cpu(ehdr, buf_ehdr->e_type); + ehdr->e_machine = elf16_to_cpu(ehdr, buf_ehdr->e_machine); + ehdr->e_version = elf32_to_cpu(ehdr, buf_ehdr->e_version); + ehdr->e_entry = elf_addr_to_cpu(ehdr, buf_ehdr->e_entry); + ehdr->e_phoff = elf_addr_to_cpu(ehdr, buf_ehdr->e_phoff); + ehdr->e_shoff = elf_addr_to_cpu(ehdr, buf_ehdr->e_shoff); + ehdr->e_flags = elf32_to_cpu(ehdr, buf_ehdr->e_flags); + ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize); + ehdr->e_phnum = elf16_to_cpu(ehdr, buf_ehdr->e_phnum); + ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize); + ehdr->e_shnum = elf16_to_cpu(ehdr, buf_ehdr->e_shnum); + ehdr->e_shstrndx = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx); + + return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC; +} + +/** + * elf_is_phdr_sane - check that it is safe to use the program header + * @buf_len: size of the buffer in which the ELF file is loaded. + */ +static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len) +{ + + if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) { + pr_debug("ELF segment location wraps around.\n"); + return false; + } else if (phdr->p_offset + phdr->p_filesz > buf_len) { + pr_debug("ELF segment not in file.\n"); + return false; + } else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) { + pr_debug("ELF segment address wraps around.\n"); + return false; + } + + return true; +} + +static int elf_read_phdr(const char *buf, size_t len, + struct kexec_elf_info *elf_info, + int idx) +{ + /* Override the const in proghdrs, we are the ones doing the loading. 
*/ + struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx]; + const char *pbuf; + struct elf_phdr *buf_phdr; + + pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr)); + buf_phdr = (struct elf_phdr *) pbuf; + + phdr->p_type = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type); + phdr->p_offset = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_offset); + phdr->p_paddr = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_paddr); + phdr->p_vaddr = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_vaddr); + phdr->p_flags = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags); + + /* + * The following fields have a type equivalent to Elf_Addr + * both in 32 bit and 64 bit ELF. + */ + phdr->p_filesz = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_filesz); + phdr->p_memsz = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_memsz); + phdr->p_align = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_align); + + return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC; +} + +/** + * elf_read_phdrs - read the program headers from the buffer + * + * This function assumes that the program header table was checked for sanity. + * Use elf_is_ehdr_sane() if it wasn't. + */ +static int elf_read_phdrs(const char *buf, size_t len, + struct kexec_elf_info *elf_info) +{ + size_t phdr_size, i; + const struct elfhdr *ehdr = elf_info->ehdr; + + /* + * e_phnum is at most 65535 so calculating the size of the + * program header cannot overflow. + */ + phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum; + + elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL); + if (!elf_info->proghdrs) + return -ENOMEM; + + for (i = 0; i < ehdr->e_phnum; i++) { + int ret; + + ret = elf_read_phdr(buf, len, elf_info, i); + if (ret) { + kfree(elf_info->proghdrs); + elf_info->proghdrs = NULL; + return ret; + } + } + + return 0; +} + +/** + * elf_is_shdr_sane - check that it is safe to use the section header + * @buf_len: size of the buffer in which the ELF file is loaded. + */ +static bool elf_is_shdr_sane(const struct elf_shdr *shdr, size_t buf_len) +{ + bool size_ok; + + /* SHT_NULL headers have undefined values, so we can't check them. */ + if (shdr->sh_type == SHT_NULL) + return true; + + /* Now verify sh_entsize */ + switch (shdr->sh_type) { + case SHT_SYMTAB: + size_ok = shdr->sh_entsize == sizeof(Elf_Sym); + break; + case SHT_RELA: + size_ok = shdr->sh_entsize == sizeof(Elf_Rela); + break; + case SHT_DYNAMIC: + size_ok = shdr->sh_entsize == sizeof(Elf_Dyn); + break; + case SHT_REL: + size_ok = shdr->sh_entsize == sizeof(Elf_Rel); + break; + case SHT_NOTE: + case SHT_PROGBITS: + case SHT_HASH: + case SHT_NOBITS: + default: + /* + * This is a section whose entsize requirements + * I don't care about. If I don't know about + * the section I can't care about it's entsize + * requirements. 
+ */ + size_ok = true; + break; + } + + if (!size_ok) { + pr_debug("ELF section with wrong entry size.\n"); + return false; + } else if (shdr->sh_addr + shdr->sh_size < shdr->sh_addr) { + pr_debug("ELF section address wraps around.\n"); + return false; + } + + if (shdr->sh_type != SHT_NOBITS) { + if (shdr->sh_offset + shdr->sh_size < shdr->sh_offset) { + pr_debug("ELF section location wraps around.\n"); + return false; + } else if (shdr->sh_offset + shdr->sh_size > buf_len) { + pr_debug("ELF section not in file.\n"); + return false; + } + } + + return true; +} + +static int elf_read_shdr(const char *buf, size_t len, + struct kexec_elf_info *elf_info, + int idx) +{ + struct elf_shdr *shdr = &elf_info->sechdrs[idx]; + const struct elfhdr *ehdr = elf_info->ehdr; + const char *sbuf; + struct elf_shdr *buf_shdr; + + sbuf = buf + ehdr->e_shoff + idx * sizeof(*buf_shdr); + buf_shdr = (struct elf_shdr *) sbuf; + + shdr->sh_name = elf32_to_cpu(ehdr, buf_shdr->sh_name); + shdr->sh_type = elf32_to_cpu(ehdr, buf_shdr->sh_type); + shdr->sh_addr = elf_addr_to_cpu(ehdr, buf_shdr->sh_addr); + shdr->sh_offset = elf_addr_to_cpu(ehdr, buf_shdr->sh_offset); + shdr->sh_link = elf32_to_cpu(ehdr, buf_shdr->sh_link); + shdr->sh_info = elf32_to_cpu(ehdr, buf_shdr->sh_info); + + /* + * The following fields have a type equivalent to Elf_Addr + * both in 32 bit and 64 bit ELF. + */ + shdr->sh_flags = elf_addr_to_cpu(ehdr, buf_shdr->sh_flags); + shdr->sh_size = elf_addr_to_cpu(ehdr, buf_shdr->sh_size); + shdr->sh_addralign = elf_addr_to_cpu(ehdr, buf_shdr->sh_addralign); + shdr->sh_entsize = elf_addr_to_cpu(ehdr, buf_shdr->sh_entsize); + + return elf_is_shdr_sane(shdr, len) ? 0 : -ENOEXEC; +} + +/** + * elf_read_shdrs - read the section headers from the buffer + * + * This function assumes that the section header table was checked for sanity. + * Use elf_is_ehdr_sane() if it wasn't. + */ +static int elf_read_shdrs(const char *buf, size_t len, + struct kexec_elf_info *elf_info) +{ + size_t shdr_size, i; + + /* + * e_shnum is at most 65536 so calculating + * the size of the section header cannot overflow. + */ + shdr_size = sizeof(struct elf_shdr) * elf_info->ehdr->e_shnum; + + elf_info->sechdrs = kzalloc(shdr_size, GFP_KERNEL); + if (!elf_info->sechdrs) + return -ENOMEM; + + for (i = 0; i < elf_info->ehdr->e_shnum; i++) { + int ret; + + ret = elf_read_shdr(buf, len, elf_info, i); + if (ret) { + kfree(elf_info->sechdrs); + elf_info->sechdrs = NULL; + return ret; + } + } + + return 0; +} + +/** + * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info + * @buf: Buffer to read ELF file from. + * @len: Size of @buf. + * @ehdr: Pointer to existing struct which will be populated. + * @elf_info: Pointer to existing struct which will be populated. + * + * This function allows reading ELF files with different byte order than + * the kernel, byte-swapping the fields as needed. + * + * Return: + * On success returns 0, and the caller should call + * kexec_free_elf_info(elf_info) to free the memory allocated for the section + * and program headers. 
+ */ +static int elf_read_from_buffer(const char *buf, size_t len, + struct elfhdr *ehdr, + struct kexec_elf_info *elf_info) +{ + int ret; + + ret = elf_read_ehdr(buf, len, ehdr); + if (ret) + return ret; + + elf_info->buffer = buf; + elf_info->ehdr = ehdr; + if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) { + ret = elf_read_phdrs(buf, len, elf_info); + if (ret) + return ret; + } + if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) { + ret = elf_read_shdrs(buf, len, elf_info); + if (ret) { + kfree(elf_info->proghdrs); + return ret; + } + } + + return 0; +} + +/** + * kexec_free_elf_info - free memory allocated by elf_read_from_buffer + */ +void kexec_free_elf_info(struct kexec_elf_info *elf_info) +{ + kfree(elf_info->proghdrs); + kfree(elf_info->sechdrs); + memset(elf_info, 0, sizeof(*elf_info)); +} +/** + * kexec_build_elf_info - read ELF executable and check that we can use it + */ +int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr, + struct kexec_elf_info *elf_info) +{ + int i; + int ret; + + ret = elf_read_from_buffer(buf, len, ehdr, elf_info); + if (ret) + return ret; + + /* Big endian vmlinux has type ET_DYN. */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) { + pr_err("Not an ELF executable.\n"); + goto error; + } else if (!elf_info->proghdrs) { + pr_err("No ELF program header.\n"); + goto error; + } + + for (i = 0; i < ehdr->e_phnum; i++) { + /* + * Kexec does not support loading interpreters. + * In addition this check keeps us from attempting + * to kexec ordinay executables. + */ + if (elf_info->proghdrs[i].p_type == PT_INTERP) { + pr_err("Requires an ELF interpreter.\n"); + goto error; + } + } + + return 0; +error: + kexec_free_elf_info(elf_info); + return -ENOEXEC; +} + + +int kexec_elf_probe(const char *buf, unsigned long len) +{ + struct elfhdr ehdr; + struct kexec_elf_info elf_info; + int ret; + + ret = kexec_build_elf_info(buf, len, &ehdr, &elf_info); + if (ret) + return ret; + + kexec_free_elf_info(&elf_info); + + return elf_check_arch(&ehdr) ? 0 : -ENOEXEC; +} + +/** + * kexec_elf_load - load ELF executable image + * @lowest_load_addr: On return, will be the address where the first PT_LOAD + * section will be loaded in memory. + * + * Return: + * 0 on success, negative value on failure. + */ +int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, + struct kexec_elf_info *elf_info, + struct kexec_buf *kbuf, + unsigned long *lowest_load_addr) +{ + unsigned long base = 0, lowest_addr = UINT_MAX; + int ret; + size_t i; + + /* Read in the PT_LOAD segments. */ + for (i = 0; i < ehdr->e_phnum; i++) { + unsigned long load_addr; + size_t size; + const struct elf_phdr *phdr; + + phdr = &elf_info->proghdrs[i]; + if (phdr->p_type != PT_LOAD) + continue; + + size = phdr->p_filesz; + if (size > phdr->p_memsz) + size = phdr->p_memsz; + + kbuf->buffer = (void *) elf_info->buffer + phdr->p_offset; + kbuf->bufsz = size; + kbuf->memsz = phdr->p_memsz; + kbuf->buf_align = phdr->p_align; + kbuf->buf_min = phdr->p_paddr + base; + kbuf->mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(kbuf); + if (ret) + goto out; + load_addr = kbuf->mem; + + if (load_addr < lowest_addr) + lowest_addr = load_addr; + } + + /* Update entry point to reflect new load address. 
*/ + ehdr->e_entry += base; + + *lowest_load_addr = lowest_addr; + ret = 0; + out: + return ret; +} -- cgit v1.2.3 From d34e0ad3eaf45c612cc7b5baa0f6b2f2a6d75612 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 23 Aug 2019 21:49:14 +0200 Subject: kexec_elf: change order of elf_*_to_cpu() functions Change the order to have a 64/32/16 order, no functional change. Signed-off-by: Sven Schnelle Reviewed-by: Thiago Jung Bauermann Signed-off-by: Helge Deller --- kernel/kexec_elf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c index 26c6310167a0..34376fbc55be 100644 --- a/kernel/kexec_elf.c +++ b/kernel/kexec_elf.c @@ -44,22 +44,22 @@ static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value) return value; } -static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value) +static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value) { if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) - value = le16_to_cpu(value); + value = le32_to_cpu(value); else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) - value = be16_to_cpu(value); + value = be32_to_cpu(value); return value; } -static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value) +static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value) { if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) - value = le32_to_cpu(value); + value = le16_to_cpu(value); else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) - value = be32_to_cpu(value); + value = be16_to_cpu(value); return value; } -- cgit v1.2.3 From 5f71d977206f3b2990ba304766ddaa9e81dbe700 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 23 Aug 2019 21:49:15 +0200 Subject: kexec_elf: remove parsing of section headers We're not using them, so we can drop the parsing. Signed-off-by: Sven Schnelle Reviewed-by: Thiago Jung Bauermann Signed-off-by: Helge Deller --- kernel/kexec_elf.c | 137 ----------------------------------------------------- 1 file changed, 137 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c index 34376fbc55be..137037603117 100644 --- a/kernel/kexec_elf.c +++ b/kernel/kexec_elf.c @@ -257,134 +257,6 @@ static int elf_read_phdrs(const char *buf, size_t len, return 0; } -/** - * elf_is_shdr_sane - check that it is safe to use the section header - * @buf_len: size of the buffer in which the ELF file is loaded. - */ -static bool elf_is_shdr_sane(const struct elf_shdr *shdr, size_t buf_len) -{ - bool size_ok; - - /* SHT_NULL headers have undefined values, so we can't check them. */ - if (shdr->sh_type == SHT_NULL) - return true; - - /* Now verify sh_entsize */ - switch (shdr->sh_type) { - case SHT_SYMTAB: - size_ok = shdr->sh_entsize == sizeof(Elf_Sym); - break; - case SHT_RELA: - size_ok = shdr->sh_entsize == sizeof(Elf_Rela); - break; - case SHT_DYNAMIC: - size_ok = shdr->sh_entsize == sizeof(Elf_Dyn); - break; - case SHT_REL: - size_ok = shdr->sh_entsize == sizeof(Elf_Rel); - break; - case SHT_NOTE: - case SHT_PROGBITS: - case SHT_HASH: - case SHT_NOBITS: - default: - /* - * This is a section whose entsize requirements - * I don't care about. If I don't know about - * the section I can't care about it's entsize - * requirements. 
- */ - size_ok = true; - break; - } - - if (!size_ok) { - pr_debug("ELF section with wrong entry size.\n"); - return false; - } else if (shdr->sh_addr + shdr->sh_size < shdr->sh_addr) { - pr_debug("ELF section address wraps around.\n"); - return false; - } - - if (shdr->sh_type != SHT_NOBITS) { - if (shdr->sh_offset + shdr->sh_size < shdr->sh_offset) { - pr_debug("ELF section location wraps around.\n"); - return false; - } else if (shdr->sh_offset + shdr->sh_size > buf_len) { - pr_debug("ELF section not in file.\n"); - return false; - } - } - - return true; -} - -static int elf_read_shdr(const char *buf, size_t len, - struct kexec_elf_info *elf_info, - int idx) -{ - struct elf_shdr *shdr = &elf_info->sechdrs[idx]; - const struct elfhdr *ehdr = elf_info->ehdr; - const char *sbuf; - struct elf_shdr *buf_shdr; - - sbuf = buf + ehdr->e_shoff + idx * sizeof(*buf_shdr); - buf_shdr = (struct elf_shdr *) sbuf; - - shdr->sh_name = elf32_to_cpu(ehdr, buf_shdr->sh_name); - shdr->sh_type = elf32_to_cpu(ehdr, buf_shdr->sh_type); - shdr->sh_addr = elf_addr_to_cpu(ehdr, buf_shdr->sh_addr); - shdr->sh_offset = elf_addr_to_cpu(ehdr, buf_shdr->sh_offset); - shdr->sh_link = elf32_to_cpu(ehdr, buf_shdr->sh_link); - shdr->sh_info = elf32_to_cpu(ehdr, buf_shdr->sh_info); - - /* - * The following fields have a type equivalent to Elf_Addr - * both in 32 bit and 64 bit ELF. - */ - shdr->sh_flags = elf_addr_to_cpu(ehdr, buf_shdr->sh_flags); - shdr->sh_size = elf_addr_to_cpu(ehdr, buf_shdr->sh_size); - shdr->sh_addralign = elf_addr_to_cpu(ehdr, buf_shdr->sh_addralign); - shdr->sh_entsize = elf_addr_to_cpu(ehdr, buf_shdr->sh_entsize); - - return elf_is_shdr_sane(shdr, len) ? 0 : -ENOEXEC; -} - -/** - * elf_read_shdrs - read the section headers from the buffer - * - * This function assumes that the section header table was checked for sanity. - * Use elf_is_ehdr_sane() if it wasn't. - */ -static int elf_read_shdrs(const char *buf, size_t len, - struct kexec_elf_info *elf_info) -{ - size_t shdr_size, i; - - /* - * e_shnum is at most 65536 so calculating - * the size of the section header cannot overflow. - */ - shdr_size = sizeof(struct elf_shdr) * elf_info->ehdr->e_shnum; - - elf_info->sechdrs = kzalloc(shdr_size, GFP_KERNEL); - if (!elf_info->sechdrs) - return -ENOMEM; - - for (i = 0; i < elf_info->ehdr->e_shnum; i++) { - int ret; - - ret = elf_read_shdr(buf, len, elf_info, i); - if (ret) { - kfree(elf_info->sechdrs); - elf_info->sechdrs = NULL; - return ret; - } - } - - return 0; -} - /** * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info * @buf: Buffer to read ELF file from. @@ -417,14 +289,6 @@ static int elf_read_from_buffer(const char *buf, size_t len, if (ret) return ret; } - if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) { - ret = elf_read_shdrs(buf, len, elf_info); - if (ret) { - kfree(elf_info->proghdrs); - return ret; - } - } - return 0; } @@ -434,7 +298,6 @@ static int elf_read_from_buffer(const char *buf, size_t len, void kexec_free_elf_info(struct kexec_elf_info *elf_info) { kfree(elf_info->proghdrs); - kfree(elf_info->sechdrs); memset(elf_info, 0, sizeof(*elf_info)); } /** -- cgit v1.2.3 From 10ba459f870c7ca469e8190a704b9bbc1d513978 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 23 Aug 2019 21:49:16 +0200 Subject: kexec_elf: remove PURGATORY_STACK_SIZE It's not used anywhere so just drop it. 
Signed-off-by: Sven Schnelle Reviewed-by: Thiago Jung Bauermann Signed-off-by: Helge Deller --- kernel/kexec_elf.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c index 137037603117..87935bd5e2ba 100644 --- a/kernel/kexec_elf.c +++ b/kernel/kexec_elf.c @@ -21,8 +21,6 @@ #include #include -#define PURGATORY_STACK_SIZE (16 * 1024) - #define elf_addr_to_cpu elf64_to_cpu #ifndef Elf_Rel -- cgit v1.2.3 From 3bd9c3366ee5c4e1b6ee8c8d9006ee559ec1d4ba Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 23 Aug 2019 21:49:17 +0200 Subject: kexec_elf: remove Elf_Rel macro It wasn't used anywhere, so lets drop it. Reviewed-by: Christophe Leroy Reviewed-by: Thiago Jung Bauermann Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- kernel/kexec_elf.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c index 87935bd5e2ba..6c806ce96ac1 100644 --- a/kernel/kexec_elf.c +++ b/kernel/kexec_elf.c @@ -23,10 +23,6 @@ #define elf_addr_to_cpu elf64_to_cpu -#ifndef Elf_Rel -#define Elf_Rel Elf64_Rel -#endif /* Elf_Rel */ - static inline bool elf_is_elf_file(const struct elfhdr *ehdr) { return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0; -- cgit v1.2.3 From 571ceb7d96952e8281bf780714773b850c403c3e Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 23 Aug 2019 21:49:18 +0200 Subject: kexec_elf: remove unused variable in kexec_elf_load() base was never assigned, so we can remove it. Reviewed-by: Christophe Leroy Reviewed-by: Thiago Jung Bauermann Signed-off-by: Sven Schnelle Signed-off-by: Helge Deller --- kernel/kexec_elf.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c index 6c806ce96ac1..85f2bd177d6e 100644 --- a/kernel/kexec_elf.c +++ b/kernel/kexec_elf.c @@ -363,7 +363,7 @@ int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, struct kexec_buf *kbuf, unsigned long *lowest_load_addr) { - unsigned long base = 0, lowest_addr = UINT_MAX; + unsigned long lowest_addr = UINT_MAX; int ret; size_t i; @@ -385,7 +385,7 @@ int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, kbuf->bufsz = size; kbuf->memsz = phdr->p_memsz; kbuf->buf_align = phdr->p_align; - kbuf->buf_min = phdr->p_paddr + base; + kbuf->buf_min = phdr->p_paddr; kbuf->mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(kbuf); if (ret) @@ -396,9 +396,6 @@ int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, lowest_addr = load_addr; } - /* Update entry point to reflect new load address. */ - ehdr->e_entry += base; - *lowest_load_addr = lowest_addr; ret = 0; out: -- cgit v1.2.3 From ea46a13ebf536698a6f456e03b0f33bffbc5b4c0 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 23 Aug 2019 21:49:19 +0200 Subject: kexec_elf: support 32 bit ELF files The powerpc version only supported 64 bit. Add some code to switch decoding of fields during runtime so we can kexec a 32 bit kernel from a 64 bit kernel and vice versa. 
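As an illustration only (condensed from the hunks below, not new code), the pattern applied to every address-sized field is to dispatch once on the class byte recorded in e_ident, using the byte-swap helpers already present in this file:

        switch (ehdr->e_ident[EI_CLASS]) {
        case ELFCLASS64:
                ehdr->e_entry = elf64_to_cpu(ehdr, buf_ehdr->e_entry);
                break;
        case ELFCLASS32:
                ehdr->e_entry = elf32_to_cpu(ehdr, buf_ehdr->e_entry);
                break;
        default:
                pr_debug("Unknown ELF class.\n");
                return -EINVAL;
        }
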
Signed-off-by: Sven Schnelle Reviewed-by: Thiago Jung Bauermann Signed-off-by: Helge Deller --- kernel/kexec_elf.c | 57 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c index 85f2bd177d6e..d3689632e8b9 100644 --- a/kernel/kexec_elf.c +++ b/kernel/kexec_elf.c @@ -21,8 +21,6 @@ #include #include -#define elf_addr_to_cpu elf64_to_cpu - static inline bool elf_is_elf_file(const struct elfhdr *ehdr) { return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0; @@ -152,9 +150,6 @@ static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr) ehdr->e_type = elf16_to_cpu(ehdr, buf_ehdr->e_type); ehdr->e_machine = elf16_to_cpu(ehdr, buf_ehdr->e_machine); ehdr->e_version = elf32_to_cpu(ehdr, buf_ehdr->e_version); - ehdr->e_entry = elf_addr_to_cpu(ehdr, buf_ehdr->e_entry); - ehdr->e_phoff = elf_addr_to_cpu(ehdr, buf_ehdr->e_phoff); - ehdr->e_shoff = elf_addr_to_cpu(ehdr, buf_ehdr->e_shoff); ehdr->e_flags = elf32_to_cpu(ehdr, buf_ehdr->e_flags); ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize); ehdr->e_phnum = elf16_to_cpu(ehdr, buf_ehdr->e_phnum); @@ -162,6 +157,24 @@ static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr) ehdr->e_shnum = elf16_to_cpu(ehdr, buf_ehdr->e_shnum); ehdr->e_shstrndx = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx); + switch (ehdr->e_ident[EI_CLASS]) { + case ELFCLASS64: + ehdr->e_entry = elf64_to_cpu(ehdr, buf_ehdr->e_entry); + ehdr->e_phoff = elf64_to_cpu(ehdr, buf_ehdr->e_phoff); + ehdr->e_shoff = elf64_to_cpu(ehdr, buf_ehdr->e_shoff); + break; + + case ELFCLASS32: + ehdr->e_entry = elf32_to_cpu(ehdr, buf_ehdr->e_entry); + ehdr->e_phoff = elf32_to_cpu(ehdr, buf_ehdr->e_phoff); + ehdr->e_shoff = elf32_to_cpu(ehdr, buf_ehdr->e_shoff); + break; + + default: + pr_debug("Unknown ELF class.\n"); + return -EINVAL; + } + return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC; } @@ -192,6 +205,7 @@ static int elf_read_phdr(const char *buf, size_t len, { /* Override the const in proghdrs, we are the ones doing the loading. */ struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx]; + const struct elfhdr *ehdr = elf_info->ehdr; const char *pbuf; struct elf_phdr *buf_phdr; @@ -199,18 +213,31 @@ static int elf_read_phdr(const char *buf, size_t len, buf_phdr = (struct elf_phdr *) pbuf; phdr->p_type = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type); - phdr->p_offset = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_offset); - phdr->p_paddr = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_paddr); - phdr->p_vaddr = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_vaddr); phdr->p_flags = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags); - /* - * The following fields have a type equivalent to Elf_Addr - * both in 32 bit and 64 bit ELF. 
- */ - phdr->p_filesz = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_filesz); - phdr->p_memsz = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_memsz); - phdr->p_align = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_align); + switch (ehdr->e_ident[EI_CLASS]) { + case ELFCLASS64: + phdr->p_offset = elf64_to_cpu(ehdr, buf_phdr->p_offset); + phdr->p_paddr = elf64_to_cpu(ehdr, buf_phdr->p_paddr); + phdr->p_vaddr = elf64_to_cpu(ehdr, buf_phdr->p_vaddr); + phdr->p_filesz = elf64_to_cpu(ehdr, buf_phdr->p_filesz); + phdr->p_memsz = elf64_to_cpu(ehdr, buf_phdr->p_memsz); + phdr->p_align = elf64_to_cpu(ehdr, buf_phdr->p_align); + break; + + case ELFCLASS32: + phdr->p_offset = elf32_to_cpu(ehdr, buf_phdr->p_offset); + phdr->p_paddr = elf32_to_cpu(ehdr, buf_phdr->p_paddr); + phdr->p_vaddr = elf32_to_cpu(ehdr, buf_phdr->p_vaddr); + phdr->p_filesz = elf32_to_cpu(ehdr, buf_phdr->p_filesz); + phdr->p_memsz = elf32_to_cpu(ehdr, buf_phdr->p_memsz); + phdr->p_align = elf32_to_cpu(ehdr, buf_phdr->p_align); + break; + + default: + pr_debug("Unknown ELF class.\n"); + return -EINVAL; + } return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC; } -- cgit v1.2.3 From 312364f3534cc974b79a96d062bde2386315201f Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 26 Aug 2019 22:14:23 +0200 Subject: kernel.h: Add non_block_start/end() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some special cases we must not block, but there's not a spinlock, preempt-off, irqs-off or similar critical section already that arms the might_sleep() debug checks. Add a non_block_start/end() pair to annotate these. This will be used in the oom paths of mmu-notifiers, where blocking is not allowed to make sure there's forward progress. Quoting Michal: "The notifier is called from quite a restricted context - oom_reaper - which shouldn't depend on any locks or sleepable conditionals. The code should be swift as well but we mostly do care about it to make a forward progress. Checking for sleepable context is the best thing we could come up with that would describe these demands at least partially." Peter also asked whether we want to catch spinlocks on top, but Michal said those are less of a problem because spinlocks can't have an indirect dependency upon the page allocator and hence close the loop with the oom reaper. Suggested by Michal Hocko. 
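Only the scheduler-side check lands in kernel/, so the annotation pair itself is not visible in the hunk below. Conceptually it is a per-task counter bracketing the critical region, roughly along these lines (sketch only, not the exact definition added outside kernel/):

        /* Sketch: open a section in which blocking would be a bug. */
        static inline void non_block_start(void)
        {
                current->non_block_count++;
        }

        /* Sketch: close the section; unbalanced calls are worth a warning. */
        static inline void non_block_end(void)
        {
                WARN_ON(current->non_block_count-- == 0);
        }

With such a counter in place, schedule_debug() below can complain about any __schedule() reached while the count is non-zero, and ___might_sleep() is made to fire inside such a section even when preemption and interrupts are otherwise enabled.
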
Link: https://lore.kernel.org/r/20190826201425.17547-4-daniel.vetter@ffwll.ch Acked-by: Christian König (v1) Acked-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Vetter Acked-by: Michal Hocko Signed-off-by: Jason Gunthorpe --- kernel/sched/core.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..57245770d6cc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3700,13 +3700,22 @@ static noinline void __schedule_bug(struct task_struct *prev) /* * Various schedule()-time debugging checks and statistics: */ -static inline void schedule_debug(struct task_struct *prev) +static inline void schedule_debug(struct task_struct *prev, bool preempt) { #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); #endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + if (!preempt && prev->state && prev->non_block_count) { + printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", + prev->comm, prev->pid, prev->non_block_count); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); + } +#endif + if (unlikely(in_atomic_preempt_off())) { __schedule_bug(prev); preempt_count_set(PREEMPT_DISABLED); @@ -3813,7 +3822,7 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev); + schedule_debug(prev, preempt); if (sched_feat(HRTICK)) hrtick_clear(rq); @@ -6570,7 +6579,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) rcu_sleep_check(); if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || + !is_idle_task(current) && !current->non_block_count) || system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || oops_in_progress) return; @@ -6586,8 +6595,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset) "BUG: sleeping function called from invalid context at %s:%d\n", file, line); printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), + "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->non_block_count, current->pid, current->comm); if (task_stack_end_corrupted(current)) -- cgit v1.2.3 From c5e4a062fe661806ab291a7576dc4a41613adb86 Mon Sep 17 00:00:00 2001 From: Matthias Maennich Date: Fri, 6 Sep 2019 11:32:25 +0100 Subject: module: support reading multiple values per modinfo tag Similar to modpost's get_next_modinfo(), introduce get_next_modinfo() in kernel/module.c to acquire any further values associated with the same modinfo tag name. That is useful for any tags that have multiple occurrences (such as 'alias'), but is in particular introduced here as part of the symbol namespaces patch series to read the (potentially) multiple namespaces a module is importing. 
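A sketch of the intended calling pattern (hypothetical caller; the "alias" tag is used here only as an example of a tag that may occur more than once):

        char *val;

        for (val = get_modinfo(info, "alias"); val;
             val = get_next_modinfo(info, "alias", val))
                pr_debug("alias: %s\n", val);

Each call resumes the scan of the .modinfo section just past the previously returned value, so every occurrence of the tag is visited exactly once.
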
Reviewed-by: Joel Fernandes (Google) Reviewed-by: Martijn Coenen Reviewed-by: Greg Kroah-Hartman Signed-off-by: Matthias Maennich Signed-off-by: Jessica Yu --- kernel/module.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9ee93421269c..3ee507c0a92f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2481,7 +2481,8 @@ static char *next_string(char *string, unsigned long *secsize) return string; } -static char *get_modinfo(struct load_info *info, const char *tag) +static char *get_next_modinfo(const struct load_info *info, const char *tag, + char *prev) { char *p; unsigned int taglen = strlen(tag); @@ -2492,13 +2493,25 @@ static char *get_modinfo(struct load_info *info, const char *tag) * get_modinfo() calls made before rewrite_section_headers() * must use sh_offset, as sh_addr isn't set! */ - for (p = (char *)info->hdr + infosec->sh_offset; p; p = next_string(p, &size)) { + char *modinfo = (char *)info->hdr + infosec->sh_offset; + + if (prev) { + size -= prev - modinfo; + modinfo = next_string(prev, &size); + } + + for (p = modinfo; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; } return NULL; } +static char *get_modinfo(const struct load_info *info, const char *tag) +{ + return get_next_modinfo(info, tag, NULL); +} + static void setup_modinfo(struct module *mod, struct load_info *info) { struct module_attribute *attr; -- cgit v1.2.3 From 8651ec01daedad26290f76beeb4736f9d2da4b87 Mon Sep 17 00:00:00 2001 From: Matthias Maennich Date: Fri, 6 Sep 2019 11:32:27 +0100 Subject: module: add support for symbol namespaces. The EXPORT_SYMBOL_NS() and EXPORT_SYMBOL_NS_GPL() macros can be used to export a symbol to a specific namespace. There are no _GPL_FUTURE and _UNUSED variants because these are currently unused, and I'm not sure they are necessary. I didn't add EXPORT_SYMBOL_NS() for ASM exports; this patch sets the namespace of ASM exports to NULL by default. In case of relative references, it will be relocatable to NULL. If there's a need, this should be pretty easy to add. A module that wants to use a symbol exported to a namespace must add a MODULE_IMPORT_NS() statement to their module code; otherwise, modpost will complain when building the module, and the kernel module loader will emit an error and fail when loading the module. MODULE_IMPORT_NS() adds a modinfo tag 'import_ns' to the module. That tag can be observed by the modinfo command, modpost and kernel/module.c at the time of loading the module. The ELF symbols are renamed to include the namespace with an asm label; for example, symbol 'usb_stor_suspend' in namespace USB_STORAGE becomes 'usb_stor_suspend.USB_STORAGE'. This allows modpost to do namespace checking, without having to go through all the effort of parsing ELF and relocation records just to get to the struct kernel_symbols. On x86_64 I saw no difference in binary size (compression), but at runtime this will require a word of memory per export to hold the namespace. An alternative could be to store namespaced symbols in their own section and use a separate 'struct namespaced_kernel_symbol' for that section, at the cost of making the module loader more complex. 
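Put together, the two sides look roughly like this (sketch, reusing the usb-storage example from above):

        /* Exporting module: place the symbol in the USB_STORAGE namespace. */
        EXPORT_SYMBOL_NS_GPL(usb_stor_suspend, USB_STORAGE);

        /* Importing module: declare the dependency; without it modpost
         * complains at build time and the module loader rejects the module.
         */
        MODULE_IMPORT_NS(USB_STORAGE);
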
Co-developed-by: Martijn Coenen Signed-off-by: Martijn Coenen Reviewed-by: Greg Kroah-Hartman Signed-off-by: Matthias Maennich Signed-off-by: Jessica Yu --- kernel/module.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3ee507c0a92f..6bb9b938f9c7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -544,6 +544,15 @@ static const char *kernel_symbol_name(const struct kernel_symbol *sym) #endif } +static const char *kernel_symbol_namespace(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return offset_to_ptr(&sym->namespace_offset); +#else + return sym->namespace; +#endif +} + static int cmp_name(const void *va, const void *vb) { const char *a; @@ -1379,6 +1388,34 @@ static inline int same_magic(const char *amagic, const char *bmagic, } #endif /* CONFIG_MODVERSIONS */ +static char *get_modinfo(const struct load_info *info, const char *tag); +static char *get_next_modinfo(const struct load_info *info, const char *tag, + char *prev); + +static int verify_namespace_is_imported(const struct load_info *info, + const struct kernel_symbol *sym, + struct module *mod) +{ + const char *namespace; + char *imported_namespace; + + namespace = kernel_symbol_namespace(sym); + if (namespace) { + imported_namespace = get_modinfo(info, "import_ns"); + while (imported_namespace) { + if (strcmp(namespace, imported_namespace) == 0) + return 0; + imported_namespace = get_next_modinfo( + info, "import_ns", imported_namespace); + } + pr_err("%s: module uses symbol (%s) from namespace %s, but does not import it.\n", + mod->name, kernel_symbol_name(sym), namespace); + return -EINVAL; + } + return 0; +} + + /* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(struct module *mod, const struct load_info *info, @@ -1407,6 +1444,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, goto getname; } + err = verify_namespace_is_imported(info, sym, mod); + if (err) { + sym = ERR_PTR(err); + goto getname; + } + err = ref_module(mod, owner); if (err) { sym = ERR_PTR(err); -- cgit v1.2.3 From 3d52ec5e5d0dd7f8ca96a68c6756bd96e58b716b Mon Sep 17 00:00:00 2001 From: Matthias Maennich Date: Fri, 6 Sep 2019 11:32:29 +0100 Subject: module: add config option MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS If MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is enabled (default=n), the requirement for modules to import all namespaces that are used by the module is relaxed. Enabling this option effectively allows (invalid) modules to be loaded while only a warning is emitted. Disabling this option keeps the enforcement at module loading time and loading is denied if the module's imports are not satisfactory. 
Reviewed-by: Martijn Coenen Reviewed-by: Greg Kroah-Hartman Signed-off-by: Matthias Maennich Signed-off-by: Jessica Yu --- kernel/module.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6bb9b938f9c7..f76efcf2043e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1408,9 +1408,16 @@ static int verify_namespace_is_imported(const struct load_info *info, imported_namespace = get_next_modinfo( info, "import_ns", imported_namespace); } - pr_err("%s: module uses symbol (%s) from namespace %s, but does not import it.\n", - mod->name, kernel_symbol_name(sym), namespace); +#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS + pr_warn( +#else + pr_err( +#endif + "%s: module uses symbol (%s) from namespace %s, but does not import it.\n", + mod->name, kernel_symbol_name(sym), namespace); +#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS return -EINVAL; +#endif } return 0; } -- cgit v1.2.3 From 77b4b5420422fc037d00b8f3f0e89b2262e4ae29 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Sep 2019 23:15:08 +0200 Subject: posix-cpu-timers: Fix permission check regression The recent consolidation of the three permission checks introduced a subtle regression. For timer_create() with a process wide timer it returns the current task if the lookup through the PID which is encoded into the clockid results in returning current. That's broken because it does not validate whether the current task is the group leader. That was caused by the two different variants of permission checks: - posix_cpu_timer_get() allowed access to the process wide clock when the looked up task is current. That's not an issue because the process wide clock is in the shared sighand. - posix_cpu_timer_create() made sure that the looked up task is the group leader. Restore the previous state. Note, that these permission checks are more than questionable, but that's subject to follow up changes. Fixes: 6ae40e3fdcd3 ("posix-cpu-timers: Provide task validation functions") Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1909052314110.1902@nanos.tec.linutronix.de --- kernel/time/posix-cpu-timers.c | 44 +++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index c3a95b122209..92a431981b1c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -47,25 +47,46 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) /* * Functions for validating access to tasks. */ -static struct task_struct *lookup_task(const pid_t pid, bool thread) +static struct task_struct *lookup_task(const pid_t pid, bool thread, + bool gettime) { struct task_struct *p; + /* + * If the encoded PID is 0, then the timer is targeted at current + * or the process to which current belongs. + */ if (!pid) return thread ? current : current->group_leader; p = find_task_by_vpid(pid); - if (!p || p == current) + if (!p) return p; + if (thread) return same_thread_group(p, current) ? p : NULL; - if (p == current) - return p; + + if (gettime) { + /* + * For clock_gettime(PROCESS) the task does not need to be + * the actual group leader. tsk->sighand gives + * access to the group's clock. + * + * Timers need the group leader because they take a + * reference on it and store the task pointer until the + * timer is destroyed. 
+ */ + return (p == current || thread_group_leader(p)) ? p : NULL; + } + + /* + * For processes require that p is group leader. + */ return has_group_leader_pid(p) ? p : NULL; } static struct task_struct *__get_task_for_clock(const clockid_t clock, - bool getref) + bool getref, bool gettime) { const bool thread = !!CPUCLOCK_PERTHREAD(clock); const pid_t pid = CPUCLOCK_PID(clock); @@ -75,7 +96,7 @@ static struct task_struct *__get_task_for_clock(const clockid_t clock, return NULL; rcu_read_lock(); - p = lookup_task(pid, thread); + p = lookup_task(pid, thread, gettime); if (p && getref) get_task_struct(p); rcu_read_unlock(); @@ -84,12 +105,17 @@ static struct task_struct *__get_task_for_clock(const clockid_t clock, static inline struct task_struct *get_task_for_clock(const clockid_t clock) { - return __get_task_for_clock(clock, true); + return __get_task_for_clock(clock, true, false); +} + +static inline struct task_struct *get_task_for_clock_get(const clockid_t clock) +{ + return __get_task_for_clock(clock, true, true); } static inline int validate_clock_permissions(const clockid_t clock) { - return __get_task_for_clock(clock, false) ? 0 : -EINVAL; + return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL; } /* @@ -339,7 +365,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) struct task_struct *tsk; u64 t; - tsk = get_task_for_clock(clock); + tsk = get_task_for_clock_get(clock); if (!tsk) return -EINVAL; -- cgit v1.2.3 From 821cc7b0b205c0df64cce59aacc330af251fa8f7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 23 Jul 2019 07:44:46 -0500 Subject: waitid: Add support for waiting for the current process group It was recently discovered that the linux version of waitid is not a superset of the other wait functions because it does not include support for waiting for the current process group. This has two downsides: 1. An extra system call is needed to get the current process group. 2. After the current process group is received and before it is passed to waitid a signal could arrive causing the current process group to change. Inherent race-conditions as these make it impossible for userspace to emulate this functionaly and thus violate async-signal safety requirements for waitpid. Arguments can be made for using a different choice of idtype and id for this case but the BSDs already use this P_PGID and 0 to indicate waiting for the current process's process group. So be nice to user space programmers and don't introduce an unnecessary incompatibility. Some people have noted that the posix description is that waitpid will wait for the current process group, and that in the presence of pthreads that process group can change. To get clarity on this issue I looked at XNU, FreeBSD, and Luminos. All of those flavors of unix waited for the current process group at the time of call and as written could not adapt to the process group changing after the call. At one point Linux did adapt to the current process group changing but that stopped in 161550d74c07 ("pid: sys_wait... fixes"). It has been over 11 years since Linux has that behavior, no programs that fail with the change in behavior have been reported, and I could not find any other unix that does this. So I think it is safe to clarify the definition of current process group, to current process group at the time of the wait function. Signed-off-by: "Eric W. Biederman" Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Cc: "H. 
Peter Anvin" Cc: Arnd Bergmann Cc: Palmer Dabbelt Cc: Rich Felker Cc: Alistair Francis Cc: Zong Li Cc: Andrew Morton Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Al Viro Cc: Florian Weimer Cc: Adhemerval Zanella Cc: GNU C Library Link: https://lore.kernel.org/r/20190814154400.6371-2-christian.brauner@ubuntu.com --- kernel/exit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 64bb6893a37d..d2d74a7b81d1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1596,10 +1596,13 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, break; case P_PGID: type = PIDTYPE_PGID; - if (upid <= 0) + if (upid < 0) return -EINVAL; - pid = find_get_pid(upid); + if (upid) + pid = find_get_pid(upid); + else + pid = get_task_pid(current, PIDTYPE_PGID); break; case P_PIDFD: type = PIDTYPE_PID; -- cgit v1.2.3 From 3fc1ca00653db6371585e3c21c4b873b2f20e60a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 6 Sep 2019 14:14:48 +0800 Subject: swiotlb: Split size parameter to map/unmap APIs This splits the size parameter to swiotlb_tbl_map_single() and swiotlb_tbl_unmap_single() into an alloc_size and a mapping_size parameter, where the latter one is rounded up to the iommu page size. Suggested-by: Christoph Hellwig Signed-off-by: Lu Baolu Reviewed-by: Christoph Hellwig Signed-off-by: Joerg Roedel --- kernel/dma/direct.c | 2 +- kernel/dma/swiotlb.c | 34 +++++++++++++++++++++------------- 2 files changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 795c9b095d75..a7f2a0163426 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -297,7 +297,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); } EXPORT_SYMBOL(dma_direct_unmap_page); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9de232229063..796a44f8ef5a 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -444,7 +444,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, size_t size, + phys_addr_t orig_addr, + size_t mapping_size, + size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { @@ -464,6 +466,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, pr_warn_once("%s is active and system is using DMA bounce buffers\n", sme_active() ? "SME" : "SEV"); + if (mapping_size > alloc_size) { + dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", + mapping_size, alloc_size); + return (phys_addr_t)DMA_MAPPING_ERROR; + } + mask = dma_get_seg_boundary(hwdev); tbl_dma_addr &= mask; @@ -471,8 +479,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; /* - * Carefully handle integer overflow which can occur when mask == ~0UL. - */ + * Carefully handle integer overflow which can occur when mask == ~0UL. + */ max_slots = mask + 1 ? 
ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); @@ -481,8 +489,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - if (size >= PAGE_SIZE) + nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + if (alloc_size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else stride = 1; @@ -547,7 +555,7 @@ not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - size, io_tlb_nslabs, tmp_io_tlb_used); + alloc_size, io_tlb_nslabs, tmp_io_tlb_used); return (phys_addr_t)DMA_MAPPING_ERROR; found: io_tlb_used += nslots; @@ -562,7 +570,7 @@ found: io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } @@ -571,11 +579,11 @@ found: * tlb_addr is the physical address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; @@ -585,7 +593,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, if (orig_addr != INVALID_PHYS_ADDR && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); + swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -665,14 +673,14 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, /* Oh well, have to allocate and map a bounce buffer. 
*/ *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), - *phys, size, dir, attrs); + *phys, size, size, dir, attrs); if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) return false; /* Ensure that the address returned is DMA'ble */ *dma_addr = __phys_to_dma(dev, *phys); if (unlikely(!dma_capable(dev, *dma_addr, size))) { - swiotlb_tbl_unmap_single(dev, *phys, size, dir, + swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return false; } -- cgit v1.2.3 From 069e1c07c18ac2ccecdfb5ac287a37d6fb2d7a00 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 11 Sep 2019 13:26:46 +0100 Subject: module: Fix link failure due to invalid relocation on namespace offset Commit 8651ec01daed ("module: add support for symbol namespaces.") broke linking for arm64 defconfig: | lib/crypto/arc4.o: In function `__ksymtab_arc4_setkey': | arc4.c:(___ksymtab+arc4_setkey+0x8): undefined reference to `no symbol' | lib/crypto/arc4.o: In function `__ksymtab_arc4_crypt': | arc4.c:(___ksymtab+arc4_crypt+0x8): undefined reference to `no symbol' This is because the dummy initialisation of the 'namespace_offset' field in 'struct kernel_symbol' when using EXPORT_SYMBOL on architectures with support for PREL32 locations uses an offset from an absolute address (0) in an effort to trick 'offset_to_pointer' into behaving as a NOP, allowing non-namespaced symbols to be treated in the same way as those belonging to a namespace. Unfortunately, place-relative relocations require a symbol reference rather than an absolute value and, although x86 appears to get away with this due to placing the kernel text at the top of the address space, it almost certainly results in a runtime failure if the kernel is relocated dynamically as a result of KASLR. Rework 'namespace_offset' so that a value of 0, which cannot occur for a valid namespaced symbol, indicates that the corresponding symbol does not belong to a namespace. Cc: Matthias Maennich Cc: Jessica Yu Cc: Ard Biesheuvel Cc: Catalin Marinas Fixes: 8651ec01daed ("module: add support for symbol namespaces.") Reported-by: kbuild test robot Tested-by: Matthias Maennich Tested-by: Ard Biesheuvel Reviewed-by: Matthias Maennich Acked-by: Ard Biesheuvel Signed-off-by: Will Deacon Signed-off-by: Jessica Yu --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f76efcf2043e..7ab244c4e1ba 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -547,6 +547,8 @@ static const char *kernel_symbol_name(const struct kernel_symbol *sym) static const char *kernel_symbol_namespace(const struct kernel_symbol *sym) { #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + if (!sym->namespace_offset) + return NULL; return offset_to_ptr(&sym->namespace_offset); #else return sym->namespace; -- cgit v1.2.3 From b605be658188005efd2d447e0989fd1f4aab8862 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 9 Sep 2019 20:39:02 +0900 Subject: module: remove unneeded casts in cmp_name() You can pass opaque pointers directly. I also renamed 'va' and 'vb' into more meaningful arguments. 
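The comparator is only handed to bsearch() when looking an exported symbol up by name, which is why opaque pointers are sufficient; roughly (illustrative sketch of the call site, not the exact code):

        const struct kernel_symbol *sym;

        /* 'name' is the key, 'start'/'stop' delimit a sorted ksymtab range. */
        sym = bsearch(name, start, stop - start, sizeof(*start), cmp_name);
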
Signed-off-by: Masahiro Yamada Signed-off-by: Jessica Yu --- kernel/module.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 7ab244c4e1ba..32873bcce738 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -555,12 +555,9 @@ static const char *kernel_symbol_namespace(const struct kernel_symbol *sym) #endif } -static int cmp_name(const void *va, const void *vb) +static int cmp_name(const void *name, const void *sym) { - const char *a; - const struct kernel_symbol *b; - a = va; b = vb; - return strcmp(a, kernel_symbol_name(b)); + return strcmp(name, kernel_symbol_name(sym)); } static bool find_exported_symbol_in_section(const struct symsearch *syms, -- cgit v1.2.3 From a0eb9abd8af92d1aa34bc1e24dfbd1ba0bd6a56c Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Wed, 11 Sep 2019 18:45:40 +0100 Subject: fork: block invalid exit signals with clone3() Previously, higher 32 bits of exit_signal fields were lost when copied to the kernel args structure (that uses int as a type for the respective field). Moreover, as Oleg has noted, exit_signal is used unchecked, so it has to be checked for sanity before use; for the legacy syscalls, applying CSIGNAL mask guarantees that it is at least non-negative; however, there's no such thing is done in clone3() code path, and that can break at least thread_group_leader. This commit adds a check to copy_clone_args_from_user() to verify that the exit signal is limited by CSIGNAL as with legacy clone() and that the signal is valid. With this we don't get the legacy clone behavior were an invalid signal could be handed down and would only be detected and ignored in do_notify_parent(). Users of clone3() will now get a proper error when they pass an invalid exit signal. Note, that this is not user-visible behavior since no kernel with clone3() has been released yet. The following program will cause a splat on a non-fixed clone3() version and will fail correctly on a fixed version: #define _GNU_SOURCE #include #include #include #include #include #include #include #include int main(int argc, char *argv[]) { pid_t pid = -1; struct clone_args args = {0}; args.exit_signal = -1; pid = syscall(__NR_clone3, &args, sizeof(struct clone_args)); if (pid < 0) exit(EXIT_FAILURE); if (pid == 0) exit(EXIT_SUCCESS); wait(NULL); exit(EXIT_SUCCESS); } Fixes: 7f192e3cd316 ("fork: add clone3") Reported-by: Oleg Nesterov Suggested-by: Oleg Nesterov Suggested-by: Dmitry V. Levin Signed-off-by: Eugene Syromiatnikov Link: https://lore.kernel.org/r/4b38fa4ce420b119a4c6345f42fe3cec2de9b0b5.1568223594.git.esyr@redhat.com [christian.brauner@ubuntu.com: simplify check and rework commit message] Signed-off-by: Christian Brauner --- kernel/fork.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2852d0e76ea3..541fd805fb88 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2338,6 +2338,8 @@ struct mm_struct *copy_init_mm(void) * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. + * + * args->exit_signal is expected to be checked for sanity by the caller. 
*/ long _do_fork(struct kernel_clone_args *args) { @@ -2562,6 +2564,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, if (copy_from_user(&args, uargs, size)) return -EFAULT; + /* + * Verify that higher 32bits of exit_signal are unset and that + * it is a valid signal + */ + if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || + !valid_signal(args.exit_signal))) + return -EINVAL; + *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), -- cgit v1.2.3 From 97a61369830ab085df5aed0ff9256f35b07d425a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 12 Sep 2019 10:56:45 -0700 Subject: cgroup: freezer: fix frozen state inheritance If a new child cgroup is created in the frozen cgroup hierarchy (one or more of ancestor cgroups is frozen), the CGRP_FREEZE cgroup flag should be set. Otherwise if a process will be attached to the child cgroup, it won't become frozen. The problem can be reproduced with the test_cgfreezer_mkdir test. This is the output before this patch: ~/test_freezer ok 1 test_cgfreezer_simple ok 2 test_cgfreezer_tree ok 3 test_cgfreezer_forkbomb Cgroup /sys/fs/cgroup/cg_test_mkdir_A/cg_test_mkdir_B isn't frozen not ok 4 test_cgfreezer_mkdir ok 5 test_cgfreezer_rmdir ok 6 test_cgfreezer_migrate ok 7 test_cgfreezer_ptrace ok 8 test_cgfreezer_stopped ok 9 test_cgfreezer_ptraced ok 10 test_cgfreezer_vfork And with this patch: ~/test_freezer ok 1 test_cgfreezer_simple ok 2 test_cgfreezer_tree ok 3 test_cgfreezer_forkbomb ok 4 test_cgfreezer_mkdir ok 5 test_cgfreezer_rmdir ok 6 test_cgfreezer_migrate ok 7 test_cgfreezer_ptrace ok 8 test_cgfreezer_stopped ok 9 test_cgfreezer_ptraced ok 10 test_cgfreezer_vfork Reported-by: Mark Crossen Signed-off-by: Roman Gushchin Fixes: 76f969e8948d ("cgroup: cgroup v2 freezer") Cc: Tejun Heo Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 753afbca549f..8be1da1ebd9a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5255,8 +5255,16 @@ static struct cgroup *cgroup_create(struct cgroup *parent) * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; - if (cgrp->freezer.e_freeze) + if (cgrp->freezer.e_freeze) { + /* + * Set the CGRP_FREEZE flag, so when a process will be + * attached to the child cgroup, it will become frozen. + * At this point the new cgroup is unpopulated, so we can + * consider it frozen immediately. + */ + set_bit(CGRP_FREEZE, &cgrp->flags); set_bit(CGRP_FROZEN, &cgrp->flags); + } spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { -- cgit v1.2.3 From eb92692b2544d3f415887dbbc98499843dfe568b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 12 Sep 2019 11:44:04 +0200 Subject: sched/fair: Speed-up energy-aware wake-ups EAS computes the energy impact of migrating a waking task when deciding on which CPU it should run. However, the current approach is known to have a high algorithmic complexity, which can result in prohibitively high wake-up latencies on systems with complex energy models, such as systems with per-CPU DVFS. On such systems, the algorithm complexity is in O(n^2) (ignoring the cost of searching for performance states in the EM) with 'n' the number of CPUs. 
To address this, re-factor the EAS wake-up path to compute the energy 'delta' (with and without the task) on a per-performance domain basis, rather than system-wide, which brings the complexity down to O(n). No functional changes intended. Test results ~~~~~~~~~~~~ * Setup: Tested on a Google Pixel 3, with a Snapdragon 845 (4+4 CPUs, A55/A75). Base kernel is 5.3-rc5 + Pixel3 specific patches. Android userspace, no graphics. * Test case: Run a periodic rt-app task, with 16ms period, ramping down from 70% to 10%, in 5% steps of 500 ms each (json avail. at [1]). Frequencies of all CPUs are pinned to max (using scaling_min_freq CPUFreq sysfs entries) to reduce variability. The time to run select_task_rq_fair() is measured using the function profiler (/sys/kernel/debug/tracing/trace_stat/function*). See the test script for more details [2]. Test 1: I hacked the DT to 'fake' per-CPU DVFS. That is, we end up with one CPUFreq policy per CPU (8 policies in total). Since all frequencies are pinned to max for the test, this should have no impact on the actual frequency selection, but it does in the EAS calculation. +---------------------------+----------------------------------+ | Without patch | With patch | +-----+-----+----------+----------+-----+-----------------+----------+ | CPU | Hit | Avg (us) | s^2 (us) | Hit | Avg (us) | s^2 (us) | |-----+-----+----------+----------+-----+-----------------+----------+ | 0 | 274 | 38.303 | 1750.239 | 401 | 14.126 (-63.1%) | 146.625 | | 1 | 197 | 49.529 | 1695.852 | 314 | 16.135 (-67.4%) | 167.525 | | 2 | 142 | 34.296 | 1758.665 | 302 | 14.133 (-58.8%) | 130.071 | | 3 | 172 | 31.734 | 1490.975 | 641 | 14.637 (-53.9%) | 139.189 | | 4 | 316 | 7.834 | 178.217 | 425 | 5.413 (-30.9%) | 20.803 | | 5 | 447 | 8.424 | 144.638 | 556 | 5.929 (-29.6%) | 27.301 | | 6 | 581 | 14.886 | 346.793 | 456 | 5.711 (-61.6%) | 23.124 | | 7 | 456 | 10.005 | 211.187 | 997 | 4.708 (-52.9%) | 21.144 | +-----+-----+----------+----------+-----+-----------------+----------+ * Hit, Avg and s^2 are as reported by the function profiler Test 2: I also ran the same test with a normal DT, with 2 CPUFreq policies, to see if this causes regressions in the most common case. +---------------------------+----------------------------------+ | Without patch | With patch | +-----+-----+----------+----------+-----+-----------------+----------+ | CPU | Hit | Avg (us) | s^2 (us) | Hit | Avg (us) | s^2 (us) | |-----+-----+----------+----------+-----+-----------------+----------+ | 0 | 345 | 22.184 | 215.321 | 580 | 18.635 (-16.0%) | 146.892 | | 1 | 358 | 18.597 | 200.596 | 438 | 12.934 (-30.5%) | 104.604 | | 2 | 359 | 25.566 | 200.217 | 397 | 10.826 (-57.7%) | 74.021 | | 3 | 362 | 16.881 | 200.291 | 718 | 11.455 (-32.1%) | 102.280 | | 4 | 457 | 3.822 | 9.895 | 757 | 4.616 (+20.8%) | 13.369 | | 5 | 344 | 4.301 | 7.121 | 594 | 5.320 (+23.7%) | 18.798 | | 6 | 472 | 4.326 | 7.849 | 464 | 5.648 (+30.6%) | 22.022 | | 7 | 331 | 4.630 | 13.937 | 408 | 5.299 (+14.4%) | 18.273 | +-----+-----+----------+----------+-----+-----------------+----------+ * Hit, Avg and s^2 are as reported by the function profiler In addition to these two tests, I also ran 50 iterations of the Lisa EAS functional test suite [3] with this patch applied on Arm Juno r0, Arm Juno r2, Arm TC2 and Hikey960, and could not see any regressions (all EAS functional tests are passing). 
[1] https://paste.debian.net/1100055/ [2] https://paste.debian.net/1100057/ [3] https://github.com/ARM-software/lisa/blob/master/lisa/tests/scheduler/eas_behaviour.py Signed-off-by: Quentin Perret Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: qais.yousef@arm.com Cc: qperret@qperret.net Cc: rjw@rjwysocki.net Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20190912094404.13802-1-qperret@qperret.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 110 ++++++++++++++++++++++++---------------------------- 1 file changed, 50 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1054d2cf6aaa..8b665110a44a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6251,69 +6251,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) } /* - * compute_energy(): Estimates the energy that would be consumed if @p was + * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of the * CPUs after the task migration, and uses the Energy Model + * landscape of @pd's CPUs after the task migration, and uses the Energy Model * to compute what would be the energy if we decided to actually migrate that * task. */ static long compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) { - unsigned int max_util, util_cfs, cpu_util, cpu_cap; - unsigned long sum_util, energy = 0; - struct task_struct *tsk; + struct cpumask *pd_mask = perf_domain_span(pd); + unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); + unsigned long max_util = 0, sum_util = 0; int cpu; - for (; pd; pd = pd->next) { - struct cpumask *pd_mask = perf_domain_span(pd); + /* + * The capacity state of CPUs of the current rd can be driven by CPUs + * of another rd if they belong to the same pd. So, account for the + * utilization of these CPUs too by masking pd with cpu_online_mask + * instead of the rd span. + * + * If an entire pd is outside of the current rd, it will not appear in + * its pd list and will not be accounted by compute_energy(). + */ + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); + struct task_struct *tsk = cpu == dst_cpu ? p : NULL; /* - * The energy model mandates all the CPUs of a performance - * domain have the same capacity. + * Busy time computation: utilization clamping is not + * required since the ratio (sum_util / cpu_capacity) + * is already enough to scale the EM reported power + * consumption at the (eventually clamped) cpu_capacity. */ - cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); - max_util = sum_util = 0; + sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + ENERGY_UTIL, NULL); /* - * The capacity state of CPUs of the current rd can be driven by - * CPUs of another rd if they belong to the same performance - * domain. So, account for the utilization of these CPUs too - * by masking pd with cpu_online_mask instead of the rd span. - * - * If an entire performance domain is outside of the current rd, - * it will not appear in its pd list and will not be accounted - * by compute_energy(). 
+ * Performance domain frequency: utilization clamping + * must be considered since it affects the selection + * of the performance domain frequency. + * NOTE: in case RT tasks are running, by default the + * FREQUENCY_UTIL's utilization can be max OPP. */ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - util_cfs = cpu_util_next(cpu, p, dst_cpu); - - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, - ENERGY_UTIL, NULL); - - /* - * Performance domain frequency: utilization clamping - * must be considered since it affects the selection - * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. - */ - tsk = cpu == dst_cpu ? p : NULL; - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); - } - - energy += em_pd_energy(pd->em_pd, max_util, sum_util); + cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + FREQUENCY_UTIL, tsk); + max_util = max(max_util, cpu_util); } - return energy; + return em_pd_energy(pd->em_pd, max_util, sum_util); } /* @@ -6355,21 +6341,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * other use-cases too. So, until someone finds a better way to solve this, * let's keep things simple by re-using the existing slow path. */ - static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { - unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; + unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + unsigned long cpu_cap, util, base_energy = 0; int cpu, best_energy_cpu = prev_cpu; - struct perf_domain *head, *pd; - unsigned long cpu_cap, util; struct sched_domain *sd; + struct perf_domain *pd; rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) goto fail; - head = pd; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6386,9 +6370,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) goto unlock; for (; pd; pd = pd->next) { - unsigned long cur_energy, spare_cap, max_spare_cap = 0; + unsigned long cur_delta, spare_cap, max_spare_cap = 0; + unsigned long base_energy_pd; int max_spare_cap_cpu = -1; + /* Compute the 'base' energy of the pd, without @p */ + base_energy_pd = compute_energy(p, -1, pd); + base_energy += base_energy_pd; + for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; @@ -6401,9 +6390,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) /* Always use prev_cpu as a candidate. */ if (cpu == prev_cpu) { - prev_energy = compute_energy(p, prev_cpu, head); - best_energy = min(best_energy, prev_energy); - continue; + prev_delta = compute_energy(p, prev_cpu, pd); + prev_delta -= base_energy_pd; + best_delta = min(best_delta, prev_delta); } /* @@ -6419,9 +6408,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) /* Evaluate the energy impact of using this CPU. 
*/ if (max_spare_cap_cpu >= 0) { - cur_energy = compute_energy(p, max_spare_cap_cpu, head); - if (cur_energy < best_energy) { - best_energy = cur_energy; + cur_delta = compute_energy(p, max_spare_cap_cpu, pd); + cur_delta -= base_energy_pd; + if (cur_delta < best_delta) { + best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; } } @@ -6433,10 +6423,10 @@ unlock: * Pick the best CPU if prev_cpu cannot be used, or if it saves at * least 6% of the energy used by prev_cpu. */ - if (prev_energy == ULONG_MAX) + if (prev_delta == ULONG_MAX) return best_energy_cpu; - if ((prev_energy - best_energy) > (prev_energy >> 4)) + if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) return best_energy_cpu; return prev_cpu; -- cgit v1.2.3 From 4adcdcea717cb2d8436bef00dd689aa5bc76f11b Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Thu, 12 Sep 2019 18:34:52 +0800 Subject: sched/psi: Correct overly pessimistic size calculation When passing a equal or more then 32 bytes long string to psi_write(), psi_write() copies 31 bytes to its buf and overwrites buf[30] with '\0'. Which makes the input string 1 byte shorter than it should be. Fix it by copying sizeof(buf) bytes when nbytes >= sizeof(buf). This does not cause problems in normal use case like: "some 500000 10000000" or "full 500000 10000000" because they are less than 32 bytes in length. /* assuming nbytes == 35 */ char buf[32]; buf_size = min(nbytes, (sizeof(buf) - 1)); /* buf_size = 31 */ if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; buf[buf_size - 1] = '\0'; /* buf[30] = '\0' */ Before: %cd /proc/pressure/ %echo "123456789|123456789|123456789|1234" > memory [ 22.473497] nbytes=35,buf_size=31 [ 22.473775] 123456789|123456789|123456789| (print 30 chars) %sh: write error: Invalid argument %echo "123456789|123456789|123456789|1" > memory [ 64.916162] nbytes=32,buf_size=31 [ 64.916331] 123456789|123456789|123456789| (print 30 chars) %sh: write error: Invalid argument After: %cd /proc/pressure/ %echo "123456789|123456789|123456789|1234" > memory [ 254.837863] nbytes=35,buf_size=32 [ 254.838541] 123456789|123456789|123456789|1 (print 31 chars) %sh: write error: Invalid argument %echo "123456789|123456789|123456789|1" > memory [ 9965.714935] nbytes=32,buf_size=32 [ 9965.715096] 123456789|123456789|123456789|1 (print 31 chars) %sh: write error: Invalid argument Also remove the superfluous parentheses. Signed-off-by: Miles Chen Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190912103452.13281-1-miles.chen@mediatek.com Signed-off-by: Ingo Molnar --- kernel/sched/psi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7acc632c3b82..4b14a3208fbe 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1190,7 +1190,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - buf_size = min(nbytes, (sizeof(buf) - 1)); + buf_size = min(nbytes, sizeof(buf)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; -- cgit v1.2.3 From b128a30409356df65f1a51cff3eb986cac8cfedc Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:21 -0400 Subject: padata: allocate workqueue internally Move workqueue allocation inside of padata to prepare for further changes to how padata uses workqueues. Guarantees the workqueue is created with max_active=1, which padata relies on to work correctly. 
No functional change. Signed-off-by: Daniel Jordan Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Jonathan Corbet Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index b60cc3dcee58..58728cd7f40c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -805,6 +805,7 @@ static void __padata_free(struct padata_instance *pinst) padata_free_pd(pinst->pd); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); + destroy_workqueue(pinst->wq); kfree(pinst); } @@ -938,13 +939,13 @@ static struct kobj_type padata_attr_type = { * padata_alloc - allocate and initialize a padata instance and specify * cpumasks for serial and parallel workers. * - * @wq: workqueue to use for the allocated padata instance + * @name: used to identify the instance * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization * * Must be called from a cpus_read_lock() protected region */ -static struct padata_instance *padata_alloc(struct workqueue_struct *wq, +static struct padata_instance *padata_alloc(const char *name, const struct cpumask *pcpumask, const struct cpumask *cbcpumask) { @@ -955,11 +956,16 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, if (!pinst) goto err; - if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + pinst->wq = alloc_workqueue("%s", WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, + 1, name); + if (!pinst->wq) goto err_free_inst; + + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + goto err_free_wq; if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { free_cpumask_var(pinst->cpumask.pcpu); - goto err_free_inst; + goto err_free_wq; } if (!padata_validate_cpumask(pinst, pcpumask) || !padata_validate_cpumask(pinst, cbcpumask)) @@ -971,8 +977,6 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, rcu_assign_pointer(pinst->pd, pd); - pinst->wq = wq; - cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); @@ -990,6 +994,8 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); +err_free_wq: + destroy_workqueue(pinst->wq); err_free_inst: kfree(pinst); err: @@ -1001,14 +1007,14 @@ err: * Use the cpu_possible_mask for serial and * parallel workers. * - * @wq: workqueue to use for the allocated padata instance + * @name: used to identify the instance * * Must be called from a cpus_read_lock() protected region */ -struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +struct padata_instance *padata_alloc_possible(const char *name) { lockdep_assert_cpus_held(); - return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); + return padata_alloc(name, cpu_possible_mask, cpu_possible_mask); } EXPORT_SYMBOL(padata_alloc_possible); -- cgit v1.2.3 From 513c98d08682957cc9eba20e7e4bb349970711f3 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:22 -0400 Subject: workqueue: unconfine alloc/apply/free_workqueue_attrs() padata will use these these interfaces in a later patch, so unconfine them. 
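For reference, an external user of the now-exported interfaces looks roughly like this (a sketch of what padata does later in the series; "wq" and "my_cpumask" are placeholders):

	struct workqueue_attrs *attrs;
	int err;

	attrs = alloc_workqueue_attrs();
	if (!attrs)
		return -ENOMEM;

	/* restrict the unbound workqueue's workers to my_cpumask */
	cpumask_copy(attrs->cpumask, my_cpumask);
	err = apply_workqueue_attrs(wq, attrs);
	free_workqueue_attrs(attrs);

A later patch in the series additionally requires the caller to hold the CPU hotplug read lock around apply_workqueue_attrs().
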
Signed-off-by: Daniel Jordan Acked-by: Tejun Heo Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 601d61150b65..f53705ff3ff1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3329,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); * * Undo alloc_workqueue_attrs(). */ -static void free_workqueue_attrs(struct workqueue_attrs *attrs) +void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); @@ -3345,7 +3345,7 @@ static void free_workqueue_attrs(struct workqueue_attrs *attrs) * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ -static struct workqueue_attrs *alloc_workqueue_attrs(void) +struct workqueue_attrs *alloc_workqueue_attrs(void) { struct workqueue_attrs *attrs; @@ -4032,7 +4032,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Return: 0 on success and -errno on failure. */ -static int apply_workqueue_attrs(struct workqueue_struct *wq, +int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { int ret; -- cgit v1.2.3 From 509b3204890ab31c3e652c26424a0706bb809933 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:23 -0400 Subject: workqueue: require CPU hotplug read exclusion for apply_workqueue_attrs Change the calling convention for apply_workqueue_attrs to require CPU hotplug read exclusion. Avoids lockdep complaints about nested calls to get_online_cpus in a future patch where padata calls apply_workqueue_attrs when changing other CPU-hotplug-sensitive data structures with the CPU read lock already held. Signed-off-by: Daniel Jordan Acked-by: Tejun Heo Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/workqueue.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f53705ff3ff1..bc2e09a8ea61 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4030,6 +4030,8 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Performs GFP_KERNEL allocations. * + * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus(). + * * Return: 0 on success and -errno on failure. 
*/ int apply_workqueue_attrs(struct workqueue_struct *wq, @@ -4037,9 +4039,11 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, { int ret; - apply_wqattrs_lock(); + lockdep_assert_cpus_held(); + + mutex_lock(&wq_pool_mutex); ret = apply_workqueue_attrs_locked(wq, attrs); - apply_wqattrs_unlock(); + mutex_unlock(&wq_pool_mutex); return ret; } @@ -4152,16 +4156,21 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) mutex_unlock(&wq->mutex); } return 0; - } else if (wq->flags & __WQ_ORDERED) { + } + + get_online_cpus(); + if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); - return ret; } else { - return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); + ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } + put_online_cpus(); + + return ret; } static int wq_clamp_max_active(int max_active, unsigned int flags, -- cgit v1.2.3 From e6ce0e0807e90d38a2cefa524ac253d7a85c3f2f Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:24 -0400 Subject: padata: make padata_do_parallel find alternate callback CPU padata_do_parallel currently returns -EINVAL if the callback CPU isn't in the callback cpumask. pcrypt tries to prevent this situation by keeping its own callback cpumask in sync with padata's and checks that the callback CPU it passes to padata is valid. Make padata handle this instead. padata_do_parallel now takes a pointer to the callback CPU and updates it for the caller if an alternate CPU is used. Overall behavior in terms of which callback CPUs are chosen stays the same. Prepares for removal of the padata cpumask notifier in pcrypt, which will fix a lockdep complaint about nested acquisition of the CPU hotplug lock later in the series. Signed-off-by: Daniel Jordan Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 58728cd7f40c..9a17922ec436 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -94,17 +94,19 @@ static void padata_parallel_worker(struct work_struct *parallel_work) * * @pinst: padata instance * @padata: object to be parallelized - * @cb_cpu: cpu the serialization callback function will run on, - * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). + * @cb_cpu: pointer to the CPU that the serialization callback function should + * run on. If it's not in the serial cpumask of @pinst + * (i.e. cpumask.cbcpu), this function selects a fallback CPU and if + * none found, returns -EINVAL. * * The parallelization callback function will run with BHs off. * Note: Every object which is parallelized by padata_do_parallel * must be seen by padata_do_serial. 
*/ int padata_do_parallel(struct padata_instance *pinst, - struct padata_priv *padata, int cb_cpu) + struct padata_priv *padata, int *cb_cpu) { - int target_cpu, err; + int i, cpu, cpu_index, target_cpu, err; struct padata_parallel_queue *queue; struct parallel_data *pd; @@ -116,8 +118,19 @@ int padata_do_parallel(struct padata_instance *pinst, if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) goto out; - if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) - goto out; + if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) { + if (!cpumask_weight(pd->cpumask.cbcpu)) + goto out; + + /* Select an alternate fallback CPU and notify the caller. */ + cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu); + + cpu = cpumask_first(pd->cpumask.cbcpu); + for (i = 0; i < cpu_index; i++) + cpu = cpumask_next(cpu, pd->cpumask.cbcpu); + + *cb_cpu = cpu; + } err = -EBUSY; if ((pinst->flags & PADATA_RESET)) @@ -129,7 +142,7 @@ int padata_do_parallel(struct padata_instance *pinst, err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; - padata->cb_cpu = cb_cpu; + padata->cb_cpu = *cb_cpu; target_cpu = padata_cpu_hash(pd); padata->cpu = target_cpu; -- cgit v1.2.3 From cc491d8e6486c56e07e60d9992cd56f63dc9fd6c Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:26 -0400 Subject: padata, pcrypt: take CPU hotplug lock internally in padata_alloc_possible With pcrypt's cpumask no longer used, take the CPU hotplug lock inside padata_alloc_possible. Useful later in the series for avoiding nested acquisition of the CPU hotplug lock in padata when padata_alloc_possible is allocating an unbound workqueue. Without this patch, this nested acquisition would happen later in the series: pcrypt_init_padata get_online_cpus alloc_padata_possible alloc_padata alloc_workqueue(WQ_UNBOUND) // later in the series alloc_and_link_pwqs apply_wqattrs_lock get_online_cpus // recursive rwsem acquisition Signed-off-by: Daniel Jordan Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 9a17922ec436..8a362923c488 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -955,8 +955,6 @@ static struct kobj_type padata_attr_type = { * @name: used to identify the instance * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization - * - * Must be called from a cpus_read_lock() protected region */ static struct padata_instance *padata_alloc(const char *name, const struct cpumask *pcpumask, @@ -974,11 +972,13 @@ static struct padata_instance *padata_alloc(const char *name, if (!pinst->wq) goto err_free_inst; + get_online_cpus(); + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) - goto err_free_wq; + goto err_put_cpus; if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { free_cpumask_var(pinst->cpumask.pcpu); - goto err_free_wq; + goto err_put_cpus; } if (!padata_validate_cpumask(pinst, pcpumask) || !padata_validate_cpumask(pinst, cbcpumask)) @@ -1002,12 +1002,16 @@ static struct padata_instance *padata_alloc(const char *name, #ifdef CONFIG_HOTPLUG_CPU cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); #endif + + put_online_cpus(); + return pinst; err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); 
free_cpumask_var(pinst->cpumask.cbcpu); -err_free_wq: +err_put_cpus: + put_online_cpus(); destroy_workqueue(pinst->wq); err_free_inst: kfree(pinst); @@ -1021,12 +1025,9 @@ err: * parallel workers. * * @name: used to identify the instance - * - * Must be called from a cpus_read_lock() protected region */ struct padata_instance *padata_alloc_possible(const char *name) { - lockdep_assert_cpus_held(); return padata_alloc(name, cpu_possible_mask, cpu_possible_mask); } EXPORT_SYMBOL(padata_alloc_possible); -- cgit v1.2.3 From 45d153c08bc73c8ced640dc20d8f2b749a6cb0d0 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:27 -0400 Subject: padata: use separate workqueues for parallel and serial work padata currently uses one per-CPU workqueue per instance for all work. Prepare for running parallel jobs on an unbound workqueue by introducing dedicated workqueues for parallel and serial work. Signed-off-by: Daniel Jordan Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 8a362923c488..669f5d53d357 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -152,7 +152,7 @@ int padata_do_parallel(struct padata_instance *pinst, list_add_tail(&padata->list, &queue->parallel.list); spin_unlock(&queue->parallel.lock); - queue_work_on(target_cpu, pinst->wq, &queue->work); + queue_work_on(target_cpu, pinst->parallel_wq, &queue->work); out: rcu_read_unlock_bh(); @@ -261,7 +261,7 @@ static void padata_reorder(struct parallel_data *pd) list_add_tail(&padata->list, &squeue->serial.list); spin_unlock(&squeue->serial.lock); - queue_work_on(cb_cpu, pinst->wq, &squeue->work); + queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work); } spin_unlock_bh(&pd->lock); @@ -278,7 +278,7 @@ static void padata_reorder(struct parallel_data *pd) next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); if (!list_empty(&next_queue->reorder.list)) - queue_work(pinst->wq, &pd->reorder_work); + queue_work(pinst->serial_wq, &pd->reorder_work); } static void invoke_padata_reorder(struct work_struct *work) @@ -818,7 +818,8 @@ static void __padata_free(struct padata_instance *pinst) padata_free_pd(pinst->pd); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); - destroy_workqueue(pinst->wq); + destroy_workqueue(pinst->serial_wq); + destroy_workqueue(pinst->parallel_wq); kfree(pinst); } @@ -967,18 +968,23 @@ static struct padata_instance *padata_alloc(const char *name, if (!pinst) goto err; - pinst->wq = alloc_workqueue("%s", WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, - 1, name); - if (!pinst->wq) + pinst->parallel_wq = alloc_workqueue("%s_parallel", WQ_MEM_RECLAIM | + WQ_CPU_INTENSIVE, 1, name); + if (!pinst->parallel_wq) goto err_free_inst; get_online_cpus(); - if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM | + WQ_CPU_INTENSIVE, 1, name); + if (!pinst->serial_wq) goto err_put_cpus; + + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + goto err_free_serial_wq; if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { free_cpumask_var(pinst->cpumask.pcpu); - goto err_put_cpus; + goto err_free_serial_wq; } if (!padata_validate_cpumask(pinst, pcpumask) || !padata_validate_cpumask(pinst, cbcpumask)) @@ -1010,9 +1016,11 @@ 
static struct padata_instance *padata_alloc(const char *name, err_free_masks: free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); +err_free_serial_wq: + destroy_workqueue(pinst->serial_wq); err_put_cpus: put_online_cpus(); - destroy_workqueue(pinst->wq); + destroy_workqueue(pinst->parallel_wq); err_free_inst: kfree(pinst); err: -- cgit v1.2.3 From bfde23ce200e6d33291d29b9b8b60cc2f30f0805 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:28 -0400 Subject: padata: unbind parallel jobs from specific CPUs Padata binds the parallel part of a job to a single CPU and round-robins over all CPUs in the system for each successive job. Though the serial parts rely on per-CPU queues for correct ordering, they're not necessary for parallel work, and it improves performance to run the job locally on NUMA machines and let the scheduler pick the CPU within a node on a busy system. So, make the parallel workqueue unbound. Update the parallel workqueue's cpumask when the instance's parallel cpumask changes. Now that parallel jobs no longer run on max_active=1 workqueues, two or more parallel works that hash to the same CPU may run simultaneously, finish out of order, and so be serialized out of order. Prevent this by keeping the works sorted on the reorder list by sequence number and checking that in the reordering logic. padata_get_next becomes padata_find_next so it can be reused for the end of padata_reorder, where it's used to avoid uselessly queueing work when the next job by sequence number isn't finished yet but a later job that hashed to the same CPU has. The ENODATA case in padata_find_next no longer makes sense because parallel jobs aren't bound to specific CPUs. The EINPROGRESS case takes care of the scenario where a parallel job is potentially running on the same CPU as padata_find_next, and with only one error code left, just use NULL instead. Signed-off-by: Daniel Jordan Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Steffen Klassert Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 118 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 65 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 669f5d53d357..832224dcf2e1 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -46,18 +46,13 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) return target_cpu; } -static int padata_cpu_hash(struct parallel_data *pd) +static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) { - unsigned int seq_nr; - int cpu_index; - /* * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use. 
*/ - - seq_nr = atomic_inc_return(&pd->seq_nr); - cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); + int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } @@ -144,7 +139,8 @@ int padata_do_parallel(struct padata_instance *pinst, padata->pd = pd; padata->cb_cpu = *cb_cpu; - target_cpu = padata_cpu_hash(pd); + padata->seq_nr = atomic_inc_return(&pd->seq_nr); + target_cpu = padata_cpu_hash(pd, padata->seq_nr); padata->cpu = target_cpu; queue = per_cpu_ptr(pd->pqueue, target_cpu); @@ -152,7 +148,7 @@ int padata_do_parallel(struct padata_instance *pinst, list_add_tail(&padata->list, &queue->parallel.list); spin_unlock(&queue->parallel.lock); - queue_work_on(target_cpu, pinst->parallel_wq, &queue->work); + queue_work(pinst->parallel_wq, &queue->work); out: rcu_read_unlock_bh(); @@ -162,21 +158,19 @@ out: EXPORT_SYMBOL(padata_do_parallel); /* - * padata_get_next - Get the next object that needs serialization. + * padata_find_next - Find the next object that needs serialization. * * Return values are: * * A pointer to the control struct of the next object that needs * serialization, if present in one of the percpu reorder queues. * - * -EINPROGRESS, if the next object that needs serialization will + * NULL, if the next object that needs serialization will * be parallel processed by another cpu and is not yet present in * the cpu's reorder queue. - * - * -ENODATA, if this cpu has to do the parallel processing for - * the next object. */ -static struct padata_priv *padata_get_next(struct parallel_data *pd) +static struct padata_priv *padata_find_next(struct parallel_data *pd, + bool remove_object) { struct padata_parallel_queue *next_queue; struct padata_priv *padata; @@ -187,28 +181,30 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) reorder = &next_queue->reorder; spin_lock(&reorder->lock); - if (!list_empty(&reorder->list)) { - padata = list_entry(reorder->list.next, - struct padata_priv, list); - - list_del_init(&padata->list); - atomic_dec(&pd->reorder_objects); + if (list_empty(&reorder->list)) { + spin_unlock(&reorder->lock); + return NULL; + } - pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, - false); + padata = list_entry(reorder->list.next, struct padata_priv, list); + /* + * Checks the rare case where two or more parallel jobs have hashed to + * the same CPU and one of the later ones finishes first. + */ + if (padata->seq_nr != pd->processed) { spin_unlock(&reorder->lock); - goto out; + return NULL; } - spin_unlock(&reorder->lock); - if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { - padata = ERR_PTR(-ENODATA); - goto out; + if (remove_object) { + list_del_init(&padata->list); + atomic_dec(&pd->reorder_objects); + ++pd->processed; + pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false); } - padata = ERR_PTR(-EINPROGRESS); -out: + spin_unlock(&reorder->lock); return padata; } @@ -234,26 +230,16 @@ static void padata_reorder(struct parallel_data *pd) return; while (1) { - padata = padata_get_next(pd); + padata = padata_find_next(pd, true); /* * If the next object that needs serialization is parallel * processed by another cpu and is still on it's way to the * cpu's reorder queue, nothing to do for now. */ - if (PTR_ERR(padata) == -EINPROGRESS) + if (!padata) break; - /* - * This cpu has to do the parallel processing of the next - * object. It's waiting in the cpu's parallelization queue, - * so exit immediately. 
- */ - if (PTR_ERR(padata) == -ENODATA) { - spin_unlock_bh(&pd->lock); - return; - } - cb_cpu = padata->cb_cpu; squeue = per_cpu_ptr(pd->squeue, cb_cpu); @@ -277,7 +263,8 @@ static void padata_reorder(struct parallel_data *pd) smp_mb(); next_queue = per_cpu_ptr(pd->pqueue, pd->cpu); - if (!list_empty(&next_queue->reorder.list)) + if (!list_empty(&next_queue->reorder.list) && + padata_find_next(pd, false)) queue_work(pinst->serial_wq, &pd->reorder_work); } @@ -332,9 +319,14 @@ void padata_do_serial(struct padata_priv *padata) struct parallel_data *pd = padata->pd; struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue, padata->cpu); + struct padata_priv *cur; spin_lock(&pqueue->reorder.lock); - list_add_tail(&padata->list, &pqueue->reorder.list); + /* Sort in ascending order of sequence number. */ + list_for_each_entry_reverse(cur, &pqueue->reorder.list, list) + if (cur->seq_nr < padata->seq_nr) + break; + list_add(&padata->list, &cur->list); atomic_inc(&pd->reorder_objects); spin_unlock(&pqueue->reorder.lock); @@ -353,17 +345,36 @@ static int padata_setup_cpumasks(struct parallel_data *pd, const struct cpumask *pcpumask, const struct cpumask *cbcpumask) { - if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) - return -ENOMEM; + struct workqueue_attrs *attrs; + int err = -ENOMEM; + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) + goto out; cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); - if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { - free_cpumask_var(pd->cpumask.pcpu); - return -ENOMEM; - } + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) + goto free_pcpu_mask; cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); + + attrs = alloc_workqueue_attrs(); + if (!attrs) + goto free_cbcpu_mask; + + /* Restrict parallel_wq workers to pd->cpumask.pcpu. */ + cpumask_copy(attrs->cpumask, pd->cpumask.pcpu); + err = apply_workqueue_attrs(pd->pinst->parallel_wq, attrs); + free_workqueue_attrs(attrs); + if (err < 0) + goto free_cbcpu_mask; + return 0; + +free_cbcpu_mask: + free_cpumask_var(pd->cpumask.cbcpu); +free_pcpu_mask: + free_cpumask_var(pd->cpumask.pcpu); +out: + return err; } static void __padata_list_init(struct padata_list *pd_list) @@ -429,6 +440,8 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, pd->squeue = alloc_percpu(struct padata_serial_queue); if (!pd->squeue) goto err_free_pqueue; + + pd->pinst = pinst; if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) goto err_free_squeue; @@ -437,7 +450,6 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); - pd->pinst = pinst; spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); INIT_WORK(&pd->reorder_work, invoke_padata_reorder); @@ -968,8 +980,8 @@ static struct padata_instance *padata_alloc(const char *name, if (!pinst) goto err; - pinst->parallel_wq = alloc_workqueue("%s_parallel", WQ_MEM_RECLAIM | - WQ_CPU_INTENSIVE, 1, name); + pinst->parallel_wq = alloc_workqueue("%s_parallel", WQ_UNBOUND, 0, + name); if (!pinst->parallel_wq) goto err_free_inst; -- cgit v1.2.3 From c51636a3065491af521187724d14a822548bcfd7 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 5 Sep 2019 21:40:29 -0400 Subject: padata: remove cpu_index from the parallel_queue With the removal of the ENODATA case from padata_get_next, the cpu_index field is no longer useful, so it can go away. 
Signed-off-by: Daniel Jordan Acked-by: Steffen Klassert Cc: Herbert Xu Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Tejun Heo Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 832224dcf2e1..c3fec1413295 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -400,21 +400,12 @@ static void padata_init_squeues(struct parallel_data *pd) /* Initialize all percpu queues used by parallel workers */ static void padata_init_pqueues(struct parallel_data *pd) { - int cpu_index, cpu; + int cpu; struct padata_parallel_queue *pqueue; - cpu_index = 0; - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, pd->cpumask.pcpu) { pqueue = per_cpu_ptr(pd->pqueue, cpu); - if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) { - pqueue->cpu_index = -1; - continue; - } - - pqueue->cpu_index = cpu_index; - cpu_index++; - __padata_list_init(&pqueue->reorder); __padata_list_init(&pqueue->parallel); INIT_WORK(&pqueue->work, padata_parallel_worker); -- cgit v1.2.3 From 786b2384bf1c1b53dc23dc493aaaae29ef01e6ce Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 23 Aug 2019 15:36:50 +0200 Subject: um: Enable CONFIG_CONSTRUCTORS We do need to call the constructors for *modules*, and at least for KASAN in the future, we must call even the kernel constructors only later when the kernel has been initialized. Instead of relying on libc to call them, emit an empty section for libc and let the kernel's CONSTRUCTORS code do the rest of the job. Tested that it indeed doesn't work in modules, and does work after the fixes in both, with a few functions with __attribute__((constructor)) in both dynamic and static builds. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3941a9c48f83..060e8e726755 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - select CONSTRUCTORS if !UML + select CONSTRUCTORS default n ---help--- This option enables gcov-based code profiling (e.g. for code coverage -- cgit v1.2.3 From af58e7ee6a8d83726ad8a2696e98d86400a7639c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Sun, 8 Sep 2019 09:20:16 +0100 Subject: xdp: Fix race in dev_map_hash_update_elem() when replacing element MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzbot found a crash in dev_map_hash_update_elem(), when replacing an element with a new one. Jesper correctly identified the cause of the crash as a race condition between the initial lookup in the map (which is done before taking the lock), and the removal of the old element. Rather than just add a second lookup into the hashmap after taking the lock, fix this by reworking the function logic to take the lock before the initial lookup. 
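The reworked flow roughly takes this shape (a sketch mirroring the hunk below, with the replacement logic abbreviated):

	spin_lock_irqsave(&dtab->index_lock, flags);

	old_dev = __dev_map_hash_lookup_elem(map, idx);
	if (old_dev && (map_flags & BPF_NOEXIST))
		goto out_err;			/* unlock and return -EEXIST */

	dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
	if (IS_ERR(dev)) {
		err = PTR_ERR(dev);
		goto out_err;			/* unlock and return the error */
	}

	/* ... unlink old_dev and insert dev, still under index_lock ... */

	spin_unlock_irqrestore(&dtab->index_lock, flags);

With lookup, allocation and replacement all performed under index_lock, a concurrent update can no longer remove and free the old element between our lookup and our list manipulation.
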
Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-and-tested-by: syzbot+4e7a85b1432052e8d6f8@syzkaller.appspotmail.com Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 9af048a932b5..d27f3b60ff6d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -650,19 +650,22 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, u32 ifindex = *(u32 *)value; u32 idx = *(u32 *)key; unsigned long flags; + int err = -EEXIST; if (unlikely(map_flags > BPF_EXIST || !ifindex)) return -EINVAL; + spin_lock_irqsave(&dtab->index_lock, flags); + old_dev = __dev_map_hash_lookup_elem(map, idx); if (old_dev && (map_flags & BPF_NOEXIST)) - return -EEXIST; + goto out_err; dev = __dev_map_alloc_node(net, dtab, ifindex, idx); - if (IS_ERR(dev)) - return PTR_ERR(dev); - - spin_lock_irqsave(&dtab->index_lock, flags); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out_err; + } if (old_dev) { hlist_del_rcu(&old_dev->index_hlist); @@ -683,6 +686,10 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; + +out_err: + spin_unlock_irqrestore(&dtab->index_lock, flags); + return err; } static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, -- cgit v1.2.3 From d895a0f16fadb26d22ab531c49768f7642ae5c3e Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 16 Aug 2019 12:53:00 +0200 Subject: bpf: fix accessing bpf_sysctl.file_pos on s390 "ctx:file_pos sysctl:read write ok" fails on s390 with "Read value != nux". This is because verifier rewrites a complete 32-bit bpf_sysctl.file_pos update to a partial update of the first 32 bits of 64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian systems. Fix by using an offset on big-endian systems. Ditto for bpf_sysctl.file_pos reads. Currently the test does not detect a problem there, since it expects to see 0, which it gets with high probability in error cases, so change it to seek to offset 3 and expect 3 in bpf_sysctl.file_pos. 
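The adjustment done by bpf_ctx_narrow_access_offset() amounts to endianness-dependent offset arithmetic; as a sketch (not the exact helper body), for a narrow access of size bytes at offset off within a size_default-byte field:

	#ifdef __LITTLE_ENDIAN
		narrow_off = off;			/* low-order bytes sit at the start */
	#else
		narrow_off = size_default - size - off;	/* low-order bytes sit at the end */
	#endif

For the 32-bit access to the 64-bit ppos backing bpf_sysctl.file_pos this yields 0 on little-endian and 4 on big-endian, so the rewritten load/store now touches the bytes that actually hold the value.
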
Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx") Signed-off-by: Ilya Leoshkevich Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20190816105300.49035-1-iii@linux.ibm.com/ --- kernel/bpf/cgroup.c | 10 ++++++++-- kernel/bpf/verifier.c | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6a6a154cfa7b..ddd8addcdb5c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1334,6 +1334,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + u32 read_size; switch (si->off) { case offsetof(struct bpf_sysctl, write): @@ -1365,7 +1366,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, ppos)); *insn++ = BPF_STX_MEM( - BPF_SIZEOF(u32), treg, si->src_reg, 0); + BPF_SIZEOF(u32), treg, si->src_reg, + bpf_ctx_narrow_access_offset( + 0, sizeof(u32), sizeof(loff_t))); *insn++ = BPF_LDX_MEM( BPF_DW, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, tmp_reg)); @@ -1374,8 +1377,11 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), si->dst_reg, si->src_reg, offsetof(struct bpf_sysctl_kern, ppos)); + read_size = bpf_size_to_bytes(BPF_SIZE(si->code)); *insn++ = BPF_LDX_MEM( - BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, + bpf_ctx_narrow_access_offset( + 0, read_size, sizeof(loff_t))); } *target_size = sizeof(u32); break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3fb50757e812..92a4332b041d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8619,8 +8619,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - u8 shift = bpf_ctx_narrow_load_shift(off, size, - size_default); + u8 shift = bpf_ctx_narrow_access_offset( + off, size, size_default) * 8; if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, -- cgit v1.2.3 From dac9f027b1096c5f03ca583e787aac0f852e8f78 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 16 Sep 2019 17:19:35 -0400 Subject: sched/fair: Remove unused cfs_rq_clock_task() function cfs_rq_clock_task() was first introduced and used in: f1b17280efbd ("sched: Maintain runnable averages across throttled periods") Over time its use has been graduately removed by the following commits: d31b1a66cbe0 ("sched/fair: Factorize PELT update") 23127296889f ("sched/fair: Update scale invariance of PELT") Today, there is no single user left, so it can be safely removed. Found via the -Wunused-function build warning. Signed-off-by: Qian Cai Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vincent Guittot Link: https://lkml.kernel.org/r/1568668775-2127-1-git-send-email-cai@lca.pw [ Rewrote the changelog. 
] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bbf68c3161..3101c662426d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -749,7 +749,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static void attach_entity_cfs_rq(struct sched_entity *se); /* @@ -4376,15 +4375,6 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } -/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; - - return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; -} - /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4476,7 +4466,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; @@ -5080,11 +5069,6 @@ static inline bool cfs_bandwidth_used(void) return false; } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - return rq_clock_task(rq_of(cfs_rq)); -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} -- cgit v1.2.3 From 08468754c16e731d31538a8b1b0b433be2410a89 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 10 Sep 2019 22:33:36 +0800 Subject: ftrace: Simplify ftrace hash lookup code in clear_func_from_hash() Function ftrace_lookup_ip() will check empty hash table. So we don't need extra check outside. Link: http://lkml.kernel.org/r/20190910143336.13472-1-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9821a3374e9..c4cc048eb594 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6036,11 +6036,7 @@ clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash) { struct ftrace_func_entry *entry; - if (ftrace_hash_empty(hash)) - return; - - entry = __ftrace_lookup_ip(hash, func->ip); - + entry = ftrace_lookup_ip(hash, func->ip); /* * Do not allow this rec to match again. * Yeah, it may waste some memory, but will be removed -- cgit v1.2.3 From 119cdbdb95a66203c0bca09474427c297186f7a3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 6 Aug 2019 18:15:43 +0300 Subject: tracing: Be more clever when dumping hex in __print_hex() Hex dump as many as 16 bytes at once in trace_print_hex_seq() instead of byte-by-byte approach. 
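For reference, the %*ph family of printk specifiers hex-dumps a small buffer in one call; a minimal sketch:

	unsigned char buf[5] = { 0xde, 0xad, 0xbe, 0xef, 0x42 };

	trace_seq_printf(p, "%*ph",  5, buf);	/* "de ad be ef 42" */
	trace_seq_printf(p, "%*phN", 5, buf);	/* "deadbeef42", used when 'concatenate' is set */

Since the specifier handles at most 64 bytes, the loop below still walks the input in 16-byte chunks.
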
Link: http://lkml.kernel.org/r/20190806151543.86061-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index cab4a5398f1d..d54ce252b05a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -219,10 +219,10 @@ trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, { int i; const char *ret = trace_seq_buffer_ptr(p); + const char *fmt = concatenate ? "%*phN" : "%*ph"; - for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ", - buf[i]); + for (i = 0; i < buf_len; i += 16) + trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]); trace_seq_putc(p, 0); return ret; -- cgit v1.2.3 From 17f8607a1658a8e70415eef67909f990d13017b5 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 1 Sep 2019 17:02:01 -0500 Subject: tracing: Make sure variable reference alias has correct var_ref_idx Original changelog from Steve Rostedt (except last sentence which explains the problem, and the Fixes: tag): I performed a three way histogram with the following commands: echo 'irq_lat u64 lat pid_t pid' > synthetic_events echo 'wake_lat u64 lat u64 irqlat pid_t pid' >> synthetic_events echo 'hist:keys=common_pid:irqts=common_timestamp.usecs if function == 0xffffffff81200580' > events/timer/hrtimer_start/trigger echo 'hist:keys=common_pid:lat=common_timestamp.usecs-$irqts:onmatch(timer.hrtimer_start).irq_lat($lat,pid) if common_flags & 1' > events/sched/sched_waking/trigger echo 'hist:keys=pid:wakets=common_timestamp.usecs,irqlat=lat' > events/synthetic/irq_lat/trigger echo 'hist:keys=next_pid:lat=common_timestamp.usecs-$wakets,irqlat=$irqlat:onmatch(synthetic.irq_lat).wake_lat($lat,$irqlat,next_pid)' > events/sched/sched_switch/trigger echo 1 > events/synthetic/wake_lat/enable Basically I wanted to see: hrtimer_start (calling function tick_sched_timer) Note: # grep tick_sched_timer /proc/kallsyms ffffffff81200580 t tick_sched_timer And save the time of that, and then record sched_waking if it is called in interrupt context and with the same pid as the hrtimer_start, it will record the latency between that and the waking event. I then look at when the task that is woken is scheduled in, and record the latency between the wakeup and the task running. At the end, the wake_lat synthetic event will show the wakeup to scheduled latency, as well as the irq latency in from hritmer_start to the wakeup. The problem is that I found this: -0 [007] d... 190.485261: wake_lat: lat=27 irqlat=190485230 pid=698 -0 [005] d... 190.485283: wake_lat: lat=40 irqlat=190485239 pid=10 -0 [002] d... 190.488327: wake_lat: lat=56 irqlat=190488266 pid=335 -0 [005] d... 190.489330: wake_lat: lat=64 irqlat=190489262 pid=10 -0 [003] d... 190.490312: wake_lat: lat=43 irqlat=190490265 pid=77 -0 [005] d... 190.493322: wake_lat: lat=54 irqlat=190493262 pid=10 -0 [005] d... 190.497305: wake_lat: lat=35 irqlat=190497267 pid=10 -0 [005] d... 190.501319: wake_lat: lat=50 irqlat=190501264 pid=10 The irqlat seemed quite large! Investigating this further, if I had enabled the irq_lat synthetic event, I noticed this: -0 [002] d.s. 249.429308: irq_lat: lat=164968 pid=335 -0 [002] d... 249.429369: wake_lat: lat=55 irqlat=249429308 pid=335 Notice that the timestamp of the irq_lat "249.429308" is awfully similar to the reported irqlat variable. 
In fact, all instances were like this. It appeared that: irqlat=$irqlat Wasn't assigning the old $irqlat to the new irqlat variable, but instead was assigning the $irqts to it. The issue is that assigning the old $irqlat to the new irqlat variable creates a variable reference alias, but the alias creation code forgets to make sure the alias uses the same var_ref_idx to access the reference. Link: http://lkml.kernel.org/r/1567375321.5282.12.camel@kernel.org Cc: Linux Trace Devel Cc: linux-rt-users Cc: stable@vger.kernel.org Fixes: 7e8b88a30b085 ("tracing: Add hist trigger support for variable reference aliases") Reported-by: Steven Rostedt (VMware) Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 3a6e42aa08e6..9468bd8d44a2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2804,6 +2804,8 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, return NULL; } + alias->var_ref_idx = var_ref->var_ref_idx; + return alias; } -- cgit v1.2.3 From d59fae6fea39efe65bb3d3310aaa2a54b5f55c0d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 17 Sep 2019 14:11:37 +0900 Subject: tracing/kprobe: Fix NULL pointer access in trace_porbe_unlink() Fix NULL pointer access in trace_probe_unlink() by initializing trace_probe.list correctly in trace_probe_init(). In the error case of trace_probe_init(), it can call trace_probe_unlink() before initializing trace_probe.list member. This causes NULL pointer dereference at list_del_init() in trace_probe_unlink(). Syzbot reported : kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8633 Comm: syz-executor797 Not tainted 5.3.0-rc8-next-20190915 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__list_del_entry_valid+0x85/0xf5 lib/list_debug.c:51 Code: 0f 84 e1 00 00 00 48 b8 22 01 00 00 00 00 ad de 49 39 c4 0f 84 e2 00 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 e2 48 c1 ea 03 <80> 3c 02 00 75 53 49 8b 14 24 4c 39 f2 0f 85 99 00 00 00 49 8d 7d RSP: 0018:ffff888090a7f9d8 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88809b6f90c0 RCX: ffffffff817c0ca9 RDX: 0000000000000000 RSI: ffffffff817c0a73 RDI: ffff88809b6f90c8 RBP: ffff888090a7f9f0 R08: ffff88809a04e600 R09: ffffed1015d26aed R10: ffffed1015d26aec R11: ffff8880ae935763 R12: 0000000000000000 R13: 0000000000000000 R14: ffff88809b6f90c0 R15: ffff88809b6f90d0 FS: 0000555556f99880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000006cc090 CR3: 00000000962b2000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __list_del_entry include/linux/list.h:131 [inline] list_del_init include/linux/list.h:190 [inline] trace_probe_unlink+0x1f/0x200 kernel/trace/trace_probe.c:959 trace_probe_cleanup+0xd3/0x110 kernel/trace/trace_probe.c:973 trace_probe_init+0x3f2/0x510 kernel/trace/trace_probe.c:1011 alloc_trace_uprobe+0x5e/0x250 kernel/trace/trace_uprobe.c:353 create_local_trace_uprobe+0x109/0x4a0 kernel/trace/trace_uprobe.c:1508 perf_uprobe_init+0x131/0x210 kernel/trace/trace_event_perf.c:314 
perf_uprobe_event_init+0x106/0x1a0 kernel/events/core.c:8898 perf_try_init_event+0x135/0x590 kernel/events/core.c:10184 perf_init_event kernel/events/core.c:10228 [inline] perf_event_alloc.part.0+0x1b89/0x33d0 kernel/events/core.c:10505 perf_event_alloc kernel/events/core.c:10887 [inline] __do_sys_perf_event_open+0xa2d/0x2d00 kernel/events/core.c:10989 __se_sys_perf_event_open kernel/events/core.c:10871 [inline] __x64_sys_perf_event_open+0xbe/0x150 kernel/events/core.c:10871 do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Link: http://lkml.kernel.org/r/156869709721.22406.5153754822203046939.stgit@devnote2 Reported-by: syzbot+2f807f4d3a2a4e87f18f@syzkaller.appspotmail.com Fixes: ca89bc071d5e ("tracing/kprobe: Add multi-probe per event support") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1e67fef06e53..baf58a3612c0 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -986,6 +986,12 @@ int trace_probe_init(struct trace_probe *tp, const char *event, if (!tp->event) return -ENOMEM; + INIT_LIST_HEAD(&tp->event->files); + INIT_LIST_HEAD(&tp->event->class.fields); + INIT_LIST_HEAD(&tp->event->probes); + INIT_LIST_HEAD(&tp->list); + list_add(&tp->event->probes, &tp->list); + call = trace_probe_event_call(tp); call->class = &tp->event->class; call->name = kstrdup(event, GFP_KERNEL); @@ -999,11 +1005,6 @@ int trace_probe_init(struct trace_probe *tp, const char *event, ret = -ENOMEM; goto error; } - INIT_LIST_HEAD(&tp->event->files); - INIT_LIST_HEAD(&tp->event->class.fields); - INIT_LIST_HEAD(&tp->event->probes); - INIT_LIST_HEAD(&tp->list); - list_add(&tp->event->probes, &tp->list); return 0; -- cgit v1.2.3 From a3db31ff6ce31f5a544a66b61613a098029031cc Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 5 Sep 2019 23:50:28 +0530 Subject: ftrace: Look up the address of return_to_handler() using helpers This ensures that we use the right address on architectures that use function descriptors. Signed-off-by: Naveen N. 
Rao Acked-by: Steven Rostedt (VMware) Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/8f6f14d192a994008ac370ce14036bbe67224c7d.1567707399.git.naveen.n.rao@linux.vnet.ibm.com --- kernel/trace/fgraph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8dfd5021b933..7950a0356042 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -276,7 +276,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, int index = task->curr_ret_stack; int i; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; if (index < 0) @@ -294,7 +294,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, { int task_idx; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; task_idx = task->curr_ret_stack; -- cgit v1.2.3 From d2935de7e4fd3f873976f1872b505584b727574c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 Mar 2019 14:58:36 +0000 Subject: vfs: Convert bpf to use the new mount API Convert the bpf filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells cc: Alexei Starovoitov cc: Daniel Borkmann cc: Martin KaFai Lau cc: Song Liu cc: Yonghong Song cc: netdev@vger.kernel.org cc: bpf@vger.kernel.org Signed-off-by: Al Viro --- kernel/bpf/inode.c | 92 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index cc0d0cf114e3..a70f7209cda3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -14,8 +14,9 @@ #include #include #include +#include +#include #include -#include #include #include #include @@ -583,58 +584,52 @@ static const struct super_operations bpf_super_ops = { enum { OPT_MODE, - OPT_ERR, }; -static const match_table_t bpf_mount_tokens = { - { OPT_MODE, "mode=%o" }, - { OPT_ERR, NULL }, +static const struct fs_parameter_spec bpf_param_specs[] = { + fsparam_u32oct ("mode", OPT_MODE), + {} +}; + +static const struct fs_parameter_description bpf_fs_parameters = { + .name = "bpf", + .specs = bpf_param_specs, }; struct bpf_mount_opts { umode_t mode; }; -static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { - substring_t args[MAX_OPT_ARGS]; - int option, token; - char *ptr; + struct bpf_mount_opts *opts = fc->fs_private; + struct fs_parse_result result; + int opt; - opts->mode = S_IRWXUGO; - - while ((ptr = strsep(&data, ",")) != NULL) { - if (!*ptr) - continue; - - token = match_token(ptr, bpf_mount_tokens, args); - switch (token) { - case OPT_MODE: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->mode = option & S_IALLUGO; - break; + opt = fs_parse(fc, &bpf_fs_parameters, param, &result); + if (opt < 0) /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ - } + return opt == -ENOPARAM ? 
0 : opt; + + switch (opt) { + case OPT_MODE: + opts->mode = result.uint_32 & S_IALLUGO; + break; } return 0; } -static int bpf_fill_super(struct super_block *sb, void *data, int silent) +static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; - struct bpf_mount_opts opts; + struct bpf_mount_opts *opts = fc->fs_private; struct inode *inode; int ret; - ret = bpf_parse_options(data, &opts); - if (ret) - return ret; - ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -644,21 +639,50 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= S_ISVTX | opts.mode; + inode->i_mode |= S_ISVTX | opts->mode; return 0; } -static struct dentry *bpf_mount(struct file_system_type *type, int flags, - const char *dev_name, void *data) +static int bpf_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, bpf_fill_super); +} + +static void bpf_free_fc(struct fs_context *fc) { - return mount_nodev(type, flags, data, bpf_fill_super); + kfree(fc->fs_private); +} + +static const struct fs_context_operations bpf_context_ops = { + .free = bpf_free_fc, + .parse_param = bpf_parse_param, + .get_tree = bpf_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int bpf_init_fs_context(struct fs_context *fc) +{ + struct bpf_mount_opts *opts; + + opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + opts->mode = S_IRWXUGO; + + fc->fs_private = opts; + fc->ops = &bpf_context_ops; + return 0; } static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", - .mount = bpf_mount, + .init_fs_context = bpf_init_fs_context, + .parameters = &bpf_fs_parameters, .kill_sb = kill_litter_super, }; -- cgit v1.2.3 From 9eea984979513d6ee137e545e26c5877d46039dd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 17 Sep 2019 10:45:37 -0700 Subject: bpf: fix BTF verification of enums vmlinux BTF has enums that are 8 byte and 1 byte in size. 2 byte enum is a valid construct as well. Fix BTF enum verification to accept those sizes. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index adb3adcebe3c..722d38e543e9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2377,9 +2377,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (t->size != sizeof(int)) { - btf_verifier_log_type(env, t, "Expected size:%zu", - sizeof(int)); + if (t->size > 8 || !is_power_of_2(t->size)) { + btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; } -- cgit v1.2.3 From 44d00dc7ceab1732ebd5f5aae601b24dacdf10c5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 18 Sep 2019 17:55:37 +0900 Subject: tracing/probe: Fix to allow user to enable events on unloaded modules Fix to allow user to enable probe events on unloaded modules. This operations was allowed before commit 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe"), because if users need to probe module init functions, they have to enable those probe events before loading module. 
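A typical workflow this restores, as a hedged example (module and symbol names are made up):

	echo 'p:myprobe mymodule:my_init_func' >> /sys/kernel/debug/tracing/kprobe_events
	echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
	modprobe mymodule	# the probe is armed when the module loads and its init runs
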
Link: http://lkml.kernel.org/r/156879693733.31056.9331322616994665167.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7579c53bb053..0ba3239c0270 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -371,31 +371,24 @@ static int enable_trace_kprobe(struct trace_event_call *call, if (enabled) return 0; - enabled = false; list_for_each_entry(pos, trace_probe_probe_list(tp), list) { tk = container_of(pos, struct trace_kprobe, tp); if (trace_kprobe_has_gone(tk)) continue; ret = __enable_trace_kprobe(tk); - if (ret) { - if (enabled) { - __disable_trace_kprobe(tp); - enabled = false; - } + if (ret) break; - } enabled = true; } - if (!enabled) { - /* No probe is enabled. Roll back */ + if (ret) { + /* Failed to enable one of them. Roll back all */ + if (enabled) + __disable_trace_kprobe(tp); if (file) trace_probe_remove_file(tp, file); else trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - if (!ret) - /* Since all probes are gone, this is not available */ - ret = -EADDRNOTAVAIL; } return ret; -- cgit v1.2.3 From fe60b0ce8e7335269722ec080173a9411a9d58a5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 18 Sep 2019 17:55:46 +0900 Subject: tracing/probe: Reject exactly same probe event Reject exactly same probe events as existing probes. Multiprobe allows user to define multiple probes on same event. If user appends a probe which exactly same definition (same probe address and same arguments) on existing event, the event will record same probe information twice. That can be confusing users, so reject it. Link: http://lkml.kernel.org/r/156879694602.31056.5533024778165036763.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 52 ++++++++++++++++++++++++++++++++++++++------- kernel/trace/trace_probe.h | 3 ++- kernel/trace/trace_uprobe.c | 52 ++++++++++++++++++++++++++++++++++++++------- 3 files changed, 90 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0ba3239c0270..a6697e28ddda 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -528,10 +528,53 @@ unreg: return 0; } +static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, + struct trace_kprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + struct trace_probe *pos; + int i; + + list_for_each_entry(pos, &tpe->probes, list) { + orig = container_of(pos, struct trace_kprobe, tp); + if (strcmp(trace_kprobe_symbol(orig), + trace_kprobe_symbol(comp)) || + trace_kprobe_offset(orig) != trace_kprobe_offset(comp)) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. 
+ */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + continue; + } + + return true; + } + + return false; +} + static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to) { int ret; + ret = trace_probe_compare_arg_type(&tk->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_kprobe_has_same_kprobe(to, tk)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + /* Append to existing event */ ret = trace_probe_append(&tk->tp, &to->tp); if (ret) @@ -568,14 +611,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) trace_probe_log_err(0, DIFF_PROBE_TYPE); ret = -EEXIST; } else { - ret = trace_probe_compare_arg_type(&tk->tp, &old_tk->tp); - if (ret) { - /* Note that argument starts index = 2 */ - trace_probe_log_set_index(ret + 1); - trace_probe_log_err(0, DIFF_ARG_TYPE); - ret = -EEXIST; - } else - ret = append_trace_kprobe(tk, old_tk); + ret = append_trace_kprobe(tk, old_tk); } goto end; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f805cc4cbe7c..4ee703728aec 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -436,7 +436,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ C(FAIL_REG_PROBE, "Failed to register probe event"),\ C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\ - C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"), + C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\ + C(SAME_PROBE, "There is already the exact same probe event"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index cbf4da4bf367..34dd6d0016a3 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -410,10 +410,53 @@ unreg: return 0; } +static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, + struct trace_uprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + struct trace_probe *pos; + struct inode *comp_inode = d_real_inode(comp->path.dentry); + int i; + + list_for_each_entry(pos, &tpe->probes, list) { + orig = container_of(pos, struct trace_uprobe, tp); + if (comp_inode != d_real_inode(orig->path.dentry) || + comp->offset != orig->offset) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. 
+ */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + continue; + } + + return true; + } + + return false; +} + static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) { int ret; + ret = trace_probe_compare_arg_type(&tu->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_uprobe_has_same_uprobe(to, tu)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + /* Append to existing event */ ret = trace_probe_append(&tu->tp, &to->tp); if (!ret) @@ -469,14 +512,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) trace_probe_log_err(0, DIFF_PROBE_TYPE); ret = -EEXIST; } else { - ret = trace_probe_compare_arg_type(&tu->tp, &old_tu->tp); - if (ret) { - /* Note that argument starts index = 2 */ - trace_probe_log_set_index(ret + 1); - trace_probe_log_err(0, DIFF_ARG_TYPE); - ret = -EEXIST; - } else - ret = append_trace_uprobe(tu, old_tu); + ret = append_trace_uprobe(tu, old_tu); } goto end; } -- cgit v1.2.3 From e430d802d6a3aaf61bd3ed03d9404888a29b9bf9 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 19 Sep 2019 20:04:47 +0800 Subject: timer: Read jiffies once when forwarding base clk The timer delayed for more than 3 seconds warning was triggered during testing. Workqueue: events_unbound sched_tick_remote RIP: 0010:sched_tick_remote+0xee/0x100 ... Call Trace: process_one_work+0x18c/0x3a0 worker_thread+0x30/0x380 kthread+0x113/0x130 ret_from_fork+0x22/0x40 The reason is that the code in collect_expired_timers() uses jiffies unprotected: if (next_event > jiffies) base->clk = jiffies; As the compiler is allowed to reload the value base->clk can advance between the check and the store and in the worst case advance farther than next event. That causes the timer expiry to be delayed until the wheel pointer wraps around. Convert the code to use READ_ONCE() Fixes: 236968383cf5 ("timers: Optimize collect_expired_timers() for NOHZ") Signed-off-by: Li RongQing Signed-off-by: Liang ZhiCheng Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1568894687-14499-1-git-send-email-lirongqing@baidu.com --- kernel/time/timer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0e315a2e77ae..4820823515e9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1678,24 +1678,26 @@ void timer_clear_idle(void) static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { + unsigned long now = READ_ONCE(jiffies); + /* * NOHZ optimization. After a long idle sleep we need to forward the * base to current jiffies. Avoid a loop by searching the bitfield for * the next expiring timer. */ - if ((long)(jiffies - base->clk) > 2) { + if ((long)(now - base->clk) > 2) { unsigned long next = __next_timer_interrupt(base); /* * If the next timer is ahead of time forward to current * jiffies, otherwise forward to the next expiry time: */ - if (time_after(next, jiffies)) { + if (time_after(next, now)) { /* * The call site will increment base->clk and then * terminate the expiry loop immediately. 
*/ - base->clk = jiffies; + base->clk = now; return 0; } base->clk = next; -- cgit v1.2.3 From 9f014e3a66bc936412b6614304a4e6c70c70230e Mon Sep 17 00:00:00 2001 From: Roy Ben Shlomo Date: Fri, 20 Sep 2019 20:12:53 +0300 Subject: perf/core: Fix several typos in comments Fix typos in a few functions' documentation comments. Signed-off-by: Roy Ben Shlomo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: royb@sentinelone.com Link: http://lore.kernel.org/lkml/20190920171254.31373-1-royb@sentinelone.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4f08b17d6426..275eae05af20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2239,7 +2239,7 @@ static void __perf_event_disable(struct perf_event *event, * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through + * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). @@ -6054,7 +6054,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr, * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more - * precisly, but there's no way to get it safely under interrupt, + * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) @@ -6616,7 +6616,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_STACK_USER) { /* - * Either we need PERF_SAMPLE_STACK_USER bit to be allways + * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. -- cgit v1.2.3 From e1572f1d08be57a5412a464cff0712a23cd0b73e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 16 Sep 2019 18:22:56 +0200 Subject: cpu/SMT: create and export cpu_smt_possible() KVM needs to know if SMT is theoretically possible, this means it is supported and not forcefully disabled ('nosmt=force'). Create and export cpu_smt_possible() answering this question. 
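As a rough usage illustration (a hypothetical caller, hedged; the actual KVM user is outside the kernel/ directory covered by this diffstat), a subsystem that must not advertise SMT-sensitive behaviour unless SMT can never come back might do:

	#include <linux/cpu.h>

	static bool example_can_assume_no_smt(void)
	{
		/*
		 * cpu_smt_possible() returns false only when SMT is
		 * unsupported or forcefully (irreversibly) disabled via
		 * nosmt=force, so the negation holds for the whole boot.
		 */
		return !cpu_smt_possible();
	}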
Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- kernel/cpu.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index e1967e9eddc2..fc28e17940e0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -392,8 +392,7 @@ enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; void __init cpu_smt_disable(bool force) { - if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || - cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + if (!cpu_smt_possible()) return; if (force) { @@ -438,6 +437,14 @@ static inline bool cpu_smt_allowed(unsigned int cpu) */ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } + +/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */ +bool cpu_smt_possible(void) +{ + return cpu_smt_control != CPU_SMT_FORCE_DISABLED && + cpu_smt_control != CPU_SMT_NOT_SUPPORTED; +} +EXPORT_SYMBOL_GPL(cpu_smt_possible); #else static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } #endif -- cgit v1.2.3 From 13224794cb0832caa403ad583d8605202cabc6bc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 23 Sep 2019 15:35:19 -0700 Subject: mm: remove quicklist page table caches Patch series "mm: remove quicklist page table caches". A while ago Nicholas proposed to remove quicklist page table caches [1]. I've rebased his patch on the curren upstream and switched ia64 and sh to use generic versions of PTE allocation. [1] https://lore.kernel.org/linux-mm/20190711030339.20892-1-npiggin@gmail.com This patch (of 3): Remove page table allocator "quicklists". These have been around for a long time, but have not got much traction in the last decade and are only used on ia64 and sh architectures. The numbers in the initial commit look interesting but probably don't apply anymore. If anybody wants to resurrect this it's in the git history, but it's unhelpful to have this code and divergent allocator behaviour for minor archs. Also it might be better to instead make more general improvements to page allocator if this is still so slow. Link: http://lkml.kernel.org/r/1565250728-21721-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Nicholas Piggin Signed-off-by: Mike Rapoport Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/idle.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c892c6280c9f..8dad5aa600ea 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -238,7 +238,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - check_pgt_cache(); rmb(); local_irq_disable(); -- cgit v1.2.3 From 00ff9a91bdb74933648a5b346d9f0edb99bd76d3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:55 -0700 Subject: mm/memory_hotplug.c: use PFN_UP / PFN_DOWN in walk_system_ram_range() Patch series "mm/memory_hotplug: online_pages() cleanups", v2. Some cleanups (+ one fix for a special case) in the context of online_pages(). This patch (of 5): This makes it clearer that we will never call func() with duplicate PFNs in case we have multiple sub-page memory resources. All unaligned parts of PFNs are completely discarded. 
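A worked example of the conversion (illustrative numbers, assuming 4 KiB pages with PAGE_SHIFT == 12): for a sub-page resource spanning [0x1200, 0x33ff], the loop below computes

	pfn     = PFN_UP(0x1200)   = (0x1200 + 0xfff) >> 12 = 2
	end_pfn = PFN_DOWN(0x3400) =  0x3400 >> 12          = 3

so func() is invoked for exactly one fully contained page (PFN 2); the partial pages at either end are discarded, and two adjacent sub-page resources can never pass the same PFN to func() twice.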
Link: http://lkml.kernel.org/r/20190814154109.3448-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Wei Yang Cc: Dan Williams Cc: Borislav Petkov Cc: Bjorn Helgaas Cc: Ingo Molnar Cc: Dave Hansen Cc: Nadav Amit Cc: Oscar Salvador Cc: Arun KS Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 74877e9d90ca..76036a41143b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, while (start < end && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, false, &res)) { - pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; - end_pfn = (res.end + 1) >> PAGE_SHIFT; + pfn = PFN_UP(res.start); + end_pfn = PFN_DOWN(res.end + 1); if (end_pfn > pfn) ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) -- cgit v1.2.3 From fb4fb04ff4dd377b3132e9b31259263ec37b830a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:22 -0700 Subject: uprobe: use original page when all uprobes are removed Currently, uprobe swaps the target page with a anonymous page in both install_breakpoint() and remove_breakpoint(). When all uprobes on a page are removed, the given mm is still using an anonymous page (not the original page). This patch allows uprobe to use original page when possible (all uprobes on the page are already removed, and the original page is in page cache and uptodate). As suggested by Oleg, we unmap the old_page and let the original page fault in. Link: http://lkml.kernel.org/r/20190815164525.1848545-3-songliubraving@fb.com Signed-off-by: Song Liu Suggested-by: Oleg Nesterov Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 66 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 84fa00497c49..648f47553bff 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -143,10 +143,12 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) * * @vma: vma that holds the pte pointing to page * @addr: address the old @page is mapped at - * @page: the cowed page we are replacing by kpage - * @kpage: the modified page we replace page by + * @old_page: the page we are replacing by new_page + * @new_page: the modified page we replace page by * - * Returns 0 on success, -EFAULT on failure. + * If @new_page is NULL, only unmap @old_page. + * + * Returns 0 on success, negative error code otherwise. 
*/ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) @@ -166,10 +168,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, - false); - if (err) - return err; + if (new_page) { + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, + &memcg, false); + if (err) + return err; + } /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(old_page); @@ -177,15 +181,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) { - mem_cgroup_cancel_charge(new_page, memcg, false); + if (new_page) + mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); - lru_cache_add_active_or_unevictable(new_page, vma); + if (new_page) { + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); + lru_cache_add_active_or_unevictable(new_page, vma); + } else + /* no new page, just dec_mm_counter for old_page */ + dec_mm_counter(mm, MM_ANONPAGES); if (!PageAnon(old_page)) { dec_mm_counter(mm, mm_counter_file(old_page)); @@ -194,8 +203,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); ptep_clear_flush_notify(vma, addr, pvmw.pte); - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + if (new_page) + set_pte_at_notify(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) @@ -488,6 +498,10 @@ retry: ref_ctr_updated = 1; } + ret = 0; + if (!is_register && !PageAnon(old_page)) + goto put_old; + ret = anon_vma_prepare(vma); if (ret) goto put_old; @@ -501,8 +515,30 @@ retry: copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (!is_register) { + struct page *orig_page; + pgoff_t index; + + VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); + + index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; + orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, + index); + + if (orig_page) { + if (PageUptodate(orig_page) && + pages_identical(new_page, orig_page)) { + /* let go new_page */ + put_page(new_page); + new_page = NULL; + } + put_page(orig_page); + } + } + ret = __replace_page(vma, vaddr, old_page, new_page); - put_page(new_page); + if (new_page) + put_page(new_page); put_old: put_page(old_page); -- cgit v1.2.3 From 5a52c9df62b422087d0c88fc79641028e4472a3b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:27 -0700 Subject: uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT Use the newly added FOLL_SPLIT_PMD in uprobe. This preserves the huge page when the uprobe is enabled. When the uprobe is disabled, newer instances of the same application could still benefit from huge page. For the next step, we will enable khugepaged to regroup the pmd, so that existing instances of the application could also benefit from huge page after the uprobe is disabled. 
Link: http://lkml.kernel.org/r/20190815164525.1848545-5-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Kirill A. Shutemov Reviewed-by: Srikar Dronamraju Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 648f47553bff..27b596f14463 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,7 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, { struct mm_struct *mm = vma->vm_mm; struct page_vma_mapped_walk pvmw = { - .page = old_page, + .page = compound_head(old_page), .vma = vma, .address = addr, }; @@ -166,8 +166,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); - VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - if (new_page) { err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); @@ -481,7 +479,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); + FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); if (ret <= 0) return ret; -- cgit v1.2.3 From f385cb85a42fc4ba92464c2bfd2e2049d65353d3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:33 -0700 Subject: uprobe: collapse THP pmd after removing all uprobes After all uprobes are removed from the huge page (with PTE pgtable), it is possible to collapse the pmd and benefit from THP again. This patch does the collapse by calling collapse_pte_mapped_thp(). Link: http://lkml.kernel.org/r/20190815164525.1848545-7-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Kirill A. Shutemov Reported-by: kbuild test robot Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 27b596f14463..94d38a39d72e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -472,6 +473,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct page *old_page, *new_page; struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; + bool orig_page_huge = false; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); @@ -529,6 +531,9 @@ retry: /* let go new_page */ put_page(new_page); new_page = NULL; + + if (PageCompound(orig_page)) + orig_page_huge = true; } put_page(orig_page); } @@ -547,6 +552,10 @@ put_old: if (ret && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); + /* try collapse pmd for compound page */ + if (!ret && orig_page_huge) + collapse_pte_mapped_thp(mm, vaddr); + return ret; } -- cgit v1.2.3 From 67f3977f805b34cf0e41090679800d2091d41d49 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:47 -0700 Subject: arm64, mm: move generic mmap layout functions to mm arm64 handles top-down mmap layout in a way that can be easily reused by other architectures, so make it available in mm. 
It then introduces a new config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT that can be set by other architectures to benefit from those functions. Note that this new config depends on MMU being enabled, if selected without MMU support, a warning will be thrown. Link: http://lkml.kernel.org/r/20190730055113.23635-5-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Suggested-by: Christoph Hellwig Acked-by: Catalin Marinas Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 078950d9605b..00fcea236eba 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -264,7 +264,8 @@ extern struct ctl_table epoll_table[]; extern struct ctl_table firmware_config_table[]; #endif -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif @@ -1573,7 +1574,8 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec, .extra1 = SYSCTL_ZERO, }, -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) { .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, -- cgit v1.2.3 From 89340d0935c9296c7b8222b6eab30e67cb57ab82 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 9 Sep 2019 09:40:28 +0800 Subject: Revert "locking/pvqspinlock: Don't wait if vCPU is preempted" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch reverts commit 75437bb304b20 (locking/pvqspinlock: Don't wait if vCPU is preempted). A large performance regression was caused by this commit. on over-subscription scenarios. The test was run on a Xeon Skylake box, 2 sockets, 40 cores, 80 threads, with three VMs of 80 vCPUs each. The score of ebizzy -M is reduced from 13000-14000 records/s to 1700-1800 records/s: Host Guest score vanilla w/o kvm optimizations upstream 1700-1800 records/s vanilla w/o kvm optimizations revert 13000-14000 records/s vanilla w/ kvm optimizations upstream 4500-5000 records/s vanilla w/ kvm optimizations revert 14000-15500 records/s Exit from aggressive wait-early mechanism can result in premature yield and extra scheduling latency. Actually, only 6% of wait_early events are caused by vcpu_is_preempted() being true. However, when one vCPU voluntarily releases its vCPU, all the subsequently waiters in the queue will do the same and the cascading effect leads to bad performance. 
kvm optimizations: [1] commit d73eb57b80b (KVM: Boost vCPUs that are delivering interrupts) [2] commit 266e85a5ec9 (KVM: X86: Boost queue head vCPU to mitigate lock waiter preemption) Tested-by: loobinliu@tencent.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Waiman Long Cc: Paolo Bonzini Cc: Radim Krčmář Cc: loobinliu@tencent.com Cc: stable@vger.kernel.org Fixes: 75437bb304b20 (locking/pvqspinlock: Don't wait if vCPU is preempted) Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- kernel/locking/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 89bab079e7a4..e84d21aa0722 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -269,7 +269,7 @@ pv_wait_early(struct pv_node *prev, int loop) if ((loop & PV_PREV_CHECK_MASK) != 0) return false; - return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu); + return READ_ONCE(prev->state) != vcpu_running; } /* -- cgit v1.2.3 From f8d7ab2bded897607bff6324d5c6ea6b4aecca0c Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 24 Sep 2019 17:19:06 +0530 Subject: tracing/probe: Fix same probe event argument matching Commit fe60b0ce8e73 ("tracing/probe: Reject exactly same probe event") tries to reject a event which matches an already existing probe. However it currently continues to match arguments and rejects adding a probe even when the arguments don't match. Fix this by only rejecting a probe if and only if all the arguments match. Link: http://lkml.kernel.org/r/20190924114906.14038-1-srikar@linux.vnet.ibm.com Fixes: fe60b0ce8e73 ("tracing/probe: Reject exactly same probe event") Acked-by: Masami Hiramatsu Signed-off-by: Srikar Dronamraju Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 5 +++-- kernel/trace/trace_uprobe.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a6697e28ddda..402dc3ce88d3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -549,10 +549,11 @@ static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, for (i = 0; i < orig->tp.nr_args; i++) { if (strcmp(orig->tp.args[i].comm, comp->tp.args[i].comm)) - continue; + break; } - return true; + if (i == orig->tp.nr_args) + return true; } return false; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 34dd6d0016a3..dd884341f5c5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -431,10 +431,11 @@ static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, for (i = 0; i < orig->tp.nr_args; i++) { if (strcmp(orig->tp.args[i].comm, comp->tp.args[i].comm)) - continue; + break; } - return true; + if (i == orig->tp.nr_args) + return true; } return false; -- cgit v1.2.3 From 3fbd7ee285b2bbc6eebd15a3c8786d9776a402a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:33:34 -0500 Subject: tasks: Add a count of task RCU users Add a count of the number of RCU users (currently 1) of the task struct so that we can later add the scheduler case and get rid of the very subtle task_rcu_dereference(), and just use rcu_dereference(). As suggested by Oleg have the count overlap rcu_head so that no additional space in task_struct is required. Inspired-by: Linus Torvalds Inspired-by: Oleg Nesterov Signed-off-by: Eric W. 
Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87woebdplt.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- kernel/exit.c | 7 ++++++- kernel/fork.c | 7 +++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 22ab6a4bdc51..3bcaec2ea3ba 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(tsk); } +void put_task_struct_rcu_user(struct task_struct *task) +{ + if (refcount_dec_and_test(&task->rcu_users)) + call_rcu(&task->rcu, delayed_put_task_struct); +} void release_task(struct task_struct *p) { @@ -222,7 +227,7 @@ repeat: write_unlock_irq(&tasklist_lock); release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); + put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) diff --git a/kernel/fork.c b/kernel/fork.c index 1d1cd06edbc1..7eefe338d7a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -902,10 +902,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; - /* - * One for us, one for whoever does the "release_task()" (usually - * parent) - */ + /* One for the user space visible state that goes away when reaped. */ + refcount_set(&tsk->rcu_users, 1); + /* One for the rcu users, and one for the scheduler */ refcount_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; -- cgit v1.2.3 From 0ff7b2cfbae36ebcd216c6a5ad7f8534eebeaee2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:33:58 -0500 Subject: tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue In the ordinary case today the RCU grace period for a task_struct is triggered when another process wait's for it's zombine and causes the kernel to call release_task(). As the waiting task has to receive a signal and then act upon it before this happens, typically this will occur after the original task as been removed from the runqueue. Unfortunaty in some cases such as self reaping tasks it can be shown that release_task() will be called starting the grace period for task_struct long before the task leaves the runqueue. Therefore use put_task_struct_rcu_user() in finish_task_switch() to guarantee that the there is a RCU lifetime after the task leaves the runqueue. Besides the change in the start of the RCU grace period for the task_struct this change may cause perf_event_delayed_put and trace_sched_process_free. The function perf_event_delayed_put boils down to just a WARN_ON for cases that I assume never show happen. So I don't see any problem with delaying it. The function trace_sched_process_free is a trace point and thus visible to user space. Occassionally userspace has the strangest dependencies so this has a miniscule chance of causing a regression. This change only changes the timing of when the tracepoint is called. The change in timing arguably gives userspace a more accurate picture of what is going on. So I don't expect there to be a regression. In the case where a task self reaps we are pretty much guaranteed that the RCU grace period is delayed. 
So we should get quite a bit of coverage in of this worst case for the change in a normal threaded workload. So I expect any issues to turn up quickly or not at all. I have lightly tested this change and everything appears to work fine. Inspired-by: Linus Torvalds Inspired-by: Oleg Nesterov Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87r24jdpl5.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 11 +++++++---- kernel/sched/core.c | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7eefe338d7a2..d6e552552e52 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -902,10 +902,13 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; - /* One for the user space visible state that goes away when reaped. */ - refcount_set(&tsk->rcu_users, 1); - /* One for the rcu users, and one for the scheduler */ - refcount_set(&tsk->usage, 2); + /* + * One for the user space visible state that goes away when reaped. + * One for the scheduler. + */ + refcount_set(&tsk->rcu_users, 2); + /* One for the rcu users */ + refcount_set(&tsk->usage, 1); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 06961b997ed6..5e5fefbd4cc7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3254,7 +3254,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) /* Task is done with its stack. */ put_task_stack(prev); - put_task_struct(prev); + put_task_struct_rcu_user(prev); } tick_nohz_task_switch(); -- cgit v1.2.3 From 154abafc68bfb7c2ef2ad5308a3b2de8968c3f61 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:34:30 -0500 Subject: tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code Remove work arounds that were written before there was a grace period after tasks left the runqueue in finish_task_switch(). In particular now that there tasks exiting the runqueue exprience a RCU grace period none of the work performed by task_rcu_dereference() excpet the rcu_dereference() is necessary so replace task_rcu_dereference() with rcu_dereference(). Remove the code in rcuwait_wait_event() that checks to ensure the current task has not exited. It is no longer necessary as it is guaranteed that any running task will experience a RCU grace period after it leaves the run queueue. Remove the comment in rcuwait_wake_up() as it is no longer relevant. Ref: 8f95c90ceb54 ("sched/wait, RCU: Introduce rcuwait machinery") Ref: 150593bf8693 ("sched/api: Introduce task_rcu_dereference() and try_get_task_struct()") Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/87lfurdpk9.fsf_-_@x220.int.ebiederm.org Signed-off-by: Ingo Molnar --- kernel/exit.c | 67 ----------------------------------------------- kernel/sched/fair.c | 2 +- kernel/sched/membarrier.c | 4 +-- 3 files changed, 3 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3bcaec2ea3ba..a46a50d67002 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -234,69 +234,6 @@ repeat: goto repeat; } -/* - * Note that if this function returns a valid task_struct pointer (!NULL) - * task->usage must remain >0 for the duration of the RCU critical section. - */ -struct task_struct *task_rcu_dereference(struct task_struct **ptask) -{ - struct sighand_struct *sighand; - struct task_struct *task; - - /* - * We need to verify that release_task() was not called and thus - * delayed_put_task_struct() can't run and drop the last reference - * before rcu_read_unlock(). We check task->sighand != NULL, - * but we can read the already freed and reused memory. - */ -retry: - task = rcu_dereference(*ptask); - if (!task) - return NULL; - - probe_kernel_address(&task->sighand, sighand); - - /* - * Pairs with atomic_dec_and_test() in put_task_struct(). If this task - * was already freed we can not miss the preceding update of this - * pointer. - */ - smp_rmb(); - if (unlikely(task != READ_ONCE(*ptask))) - goto retry; - - /* - * We've re-checked that "task == *ptask", now we have two different - * cases: - * - * 1. This is actually the same task/task_struct. In this case - * sighand != NULL tells us it is still alive. - * - * 2. This is another task which got the same memory for task_struct. - * We can't know this of course, and we can not trust - * sighand != NULL. - * - * In this case we actually return a random value, but this is - * correct. - * - * If we return NULL - we can pretend that we actually noticed that - * *ptask was updated when the previous task has exited. Or pretend - * that probe_slab_address(&sighand) reads NULL. - * - * If we return the new task (because sighand is not NULL for any - * reason) - this is fine too. This (new) task can't go away before - * another gp pass. - * - * And note: We could even eliminate the false positive if re-read - * task->sighand once again to avoid the falsely NULL. But this case - * is very unlikely so we don't care. - */ - if (!sighand) - return NULL; - - return task; -} - void rcuwait_wake_up(struct rcuwait *w) { struct task_struct *task; @@ -316,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w) */ smp_mb(); /* (B) */ - /* - * Avoid using task_rcu_dereference() magic as long as we are careful, - * see comment in rcuwait_wait_event() regarding ->exit_state. 
- */ task = rcu_dereference(w->task); if (task) wake_up_process(task); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3101c662426d..5bc23996ffae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1602,7 +1602,7 @@ static void task_numa_compare(struct task_numa_env *env, return; rcu_read_lock(); - cur = task_rcu_dereference(&dst_rq->curr); + cur = rcu_dereference(dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index aa8d75804108..b14250a11608 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -71,7 +71,7 @@ static int membarrier_global_expedited(void) continue; rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); + p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { if (!fallback) @@ -150,7 +150,7 @@ static int membarrier_private_expedited(int flags) if (cpu == raw_smp_processor_id()) continue; rcu_read_lock(); - p = task_rcu_dereference(&cpu_rq(cpu)->curr); + p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == current->mm) { if (!fallback) __cpumask_set_cpu(cpu, tmpmask); -- cgit v1.2.3 From 5311a98fef7d0dc2e8040ae0e18f5568d6d1dd5a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 14 Sep 2019 07:35:02 -0500 Subject: tasks, sched/core: RCUify the assignment of rq->curr The current task on the runqueue is currently read with rcu_dereference(). To obtain ordinary RCU semantics for an rcu_dereference() of rq->curr it needs to be paired with rcu_assign_pointer() of rq->curr. Which provides the memory barrier necessary to order assignments to the task_struct and the assignment to rq->curr. Unfortunately the assignment of rq->curr in __schedule is a hot path, and it has already been show that additional barriers in that code will reduce the performance of the scheduler. So I will attempt to describe below why you can effectively have ordinary RCU semantics without any additional barriers. The assignment of rq->curr in init_idle is a slow path called once per cpu and that can use rcu_assign_pointer() without any concerns. As I write this there are effectively two users of rcu_dereference() on rq->curr. There is the membarrier code in kernel/sched/membarrier.c that only looks at "->mm" after the rcu_dereference(). Then there is task_numa_compare() in kernel/sched/fair.c. My best reading of the code shows that task_numa_compare only access: "->flags", "->cpus_ptr", "->numa_group", "->numa_faults[]", "->total_numa_faults", and "->se.cfs_rq". The code in __schedule() essentially does: rq_lock(...); smp_mb__after_spinlock(); next = pick_next_task(...); rq->curr = next; context_switch(prev, next); At the start of the function the rq_lock/smp_mb__after_spinlock pair provides a full memory barrier. Further there is a full memory barrier in context_switch(). This means that any task that has already run and modified itself (the common case) has already seen two memory barriers before __schedule() runs and begins executing. A task that modifies itself then sees a third full memory barrier pair with the rq_lock(); For a brand new task that is enqueued with wake_up_new_task() there are the memory barriers present from the taking and release the pi_lock and the rq_lock as the processes is enqueued as well as the full memory barrier at the start of __schedule() assuming __schedule() happens on the same cpu. 
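For reference, the reader side this pairs with looks like the membarrier loop later in this series (an illustrative sketch only; cpu, mm, p and tmpmask are borrowed from that code):

	rcu_read_lock();
	p = rcu_dereference(cpu_rq(cpu)->curr);
	if (p && p->mm == mm)
		__cpumask_set_cpu(cpu, tmpmask);
	rcu_read_unlock();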
This means that by the time we reach the assignment of rq->curr except for values on the task struct modified in pick_next_task the code has the same guarantees as if it used rcu_assign_pointer(). Reading through all of the implementations of pick_next_task it appears pick_next_task is limited to modifying the task_struct fields "->se", "->rt", "->dl". These fields are the sched_entity structures of the varies schedulers. Further "->se.cfs_rq" is only changed in cgroup attach/move operations initialized by userspace. Unless I have missed something this means that in practice that the users of "rcu_dereference(rq->curr)" get normal RCU semantics of rcu_dereference() for the fields the care about, despite the assignment of rq->curr in __schedule() ot using rcu_assign_pointer. Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20190903200603.GW2349@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e5fefbd4cc7..84c71160beb1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4033,7 +4033,11 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; - rq->curr = next; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). + */ + RCU_INIT_POINTER(rq->curr, next); /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6060,7 +6064,8 @@ void init_idle(struct task_struct *idle, int cpu) __set_task_cpu(idle, cpu); rcu_read_unlock(); - rq->curr = rq->idle = idle; + rq->idle = idle; + rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP idle->on_cpu = 1; -- cgit v1.2.3 From fc0d77387cb5ae883fd774fc559e056a8dde024c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:36:59 -0400 Subject: sched/membarrier: Fix private expedited registration check Fix a logic flaw in the way membarrier_register_private_expedited() handles ready state checks for private expedited sync core and private expedited registrations. If a private expedited membarrier registration is first performed, and then a private expedited sync_core registration is performed, the ready state check will skip the second registration when it really should not. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-2-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index b14250a11608..d48b95fc4026 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -226,7 +226,7 @@ static int membarrier_register_private_expedited(int flags) * groups, which use the same mm. 
(CLONE_VM but not * CLONE_THREAD). */ - if (atomic_read(&mm->membarrier_state) & state) + if ((atomic_read(&mm->membarrier_state) & state) == state) return 0; atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (flags & MEMBARRIER_FLAG_SYNC_CORE) -- cgit v1.2.3 From 09554009c0cad4cb2223dd943c813c9257c6883a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:00 -0400 Subject: sched/membarrier: Remove redundant check Checking that the number of threads is 1 is redundant with checking mm_users == 1. No change in functionality intended. Suggested-by: Oleg Nesterov Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-3-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index d48b95fc4026..7ccbd0e19626 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -186,7 +186,7 @@ static int membarrier_register_global_expedited(void) MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) return 0; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); - if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { + if (atomic_read(&mm->mm_users) == 1) { /* * For single mm user, single threaded process, we can * simply issue a memory barrier after setting @@ -232,7 +232,7 @@ static int membarrier_register_private_expedited(int flags) if (flags & MEMBARRIER_FLAG_SYNC_CORE) atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, &mm->membarrier_state); - if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { + if (atomic_read(&mm->mm_users) != 1) { /* * Ensure all future scheduler executions will observe the * new thread flag state for this process. -- cgit v1.2.3 From 227a4aadc75ba22fcb6c4e1c078817b8cbaae4ce Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:02 -0400 Subject: sched/membarrier: Fix p->mm->membarrier_state racy load The membarrier_state field is located within the mm_struct, which is not guaranteed to exist when used from runqueue-lock-free iteration on runqueues by the membarrier system call. Copy the membarrier_state from the mm_struct into the scheduler runqueue when the scheduler switches between mm. When registering membarrier for mm, after setting the registration bit in the mm membarrier state, issue a synchronize_rcu() to ensure the scheduler observes the change. In order to take care of the case where a runqueue keeps executing the target mm without swapping to other mm, iterate over each runqueue and issue an IPI to copy the membarrier_state from the mm_struct into each runqueue which have the same mm which state has just been modified. Move the mm membarrier_state field closer to pgd in mm_struct to use a cache line already touched by the scheduler switch_mm. The membarrier_execve() (now membarrier_exec_mmap) hook now needs to clear the runqueue's membarrier state in addition to clear the mm membarrier state, so move its implementation into the scheduler membarrier code so it can access the runqueue structure. 
Add memory barrier in membarrier_exec_mmap() prior to clearing the membarrier state, ensuring memory accesses executed prior to exec are not reordered with the stores clearing the membarrier state. As suggested by Linus, move all membarrier.c RCU read-side locks outside of the for each cpu loops. Suggested-by: Linus Torvalds Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-5-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +- kernel/sched/membarrier.c | 175 ++++++++++++++++++++++++++++++++++------------ kernel/sched/sched.h | 34 +++++++++ 3 files changed, 168 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 84c71160beb1..2d9a3947bef4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev, else prev->active_mm = NULL; } else { // to user + membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting - * rq->curr and returning to userspace. + * rq->curr / membarrier_switch_mm() and returning to userspace. * * The below provides this either through switch_mm(), or in * case 'prev->active_mm == next->mm' through * finish_task_switch()'s mmdrop(). */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); if (!prev->mm) { // from kernel diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 7ccbd0e19626..070cf433bb9a 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -30,6 +30,39 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_rq_state(void *info) +{ + struct mm_struct *mm = (struct mm_struct *) info; + + if (current->mm != mm) + return; + this_cpu_write(runqueues.membarrier_state, + atomic_read(&mm->membarrier_state)); + /* + * Issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to + * guarantee that no memory access following registration is reordered + * before registration. + */ + smp_mb(); +} + +void membarrier_exec_mmap(struct mm_struct *mm) +{ + /* + * Issue a memory barrier before clearing membarrier_state to + * guarantee that no memory access prior to exec is reordered after + * clearing this state. + */ + smp_mb(); + atomic_set(&mm->membarrier_state, 0); + /* + * Keep the runqueue membarrier_state in sync with this mm + * membarrier_state. + */ + this_cpu_write(runqueues.membarrier_state, 0); +} + static int membarrier_global_expedited(void) { int cpu; @@ -56,6 +89,7 @@ static int membarrier_global_expedited(void) } cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -70,17 +104,25 @@ static int membarrier_global_expedited(void) if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); + if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) + continue; + + /* + * Skip the CPU if it runs a kernel thread. The scheduler + * leaves the prior task mm in place as an optimization when + * scheduling a kthread. 
+ */ p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & - MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } - rcu_read_unlock(); + if (p->flags & PF_KTHREAD) + continue; + + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); } + rcu_read_unlock(); if (!fallback) { preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); @@ -136,6 +178,7 @@ static int membarrier_private_expedited(int flags) } cpus_read_lock(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -157,8 +200,8 @@ static int membarrier_private_expedited(int flags) else smp_call_function_single(cpu, ipi_mb, NULL, 1); } - rcu_read_unlock(); } + rcu_read_unlock(); if (!fallback) { preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); @@ -177,32 +220,78 @@ static int membarrier_private_expedited(int flags) return 0; } +static int sync_runqueues_membarrier_state(struct mm_struct *mm) +{ + int membarrier_state = atomic_read(&mm->membarrier_state); + cpumask_var_t tmpmask; + int cpu; + + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) { + this_cpu_write(runqueues.membarrier_state, membarrier_state); + + /* + * For single mm user, we can simply issue a memory barrier + * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the + * mm and in the current runqueue to guarantee that no memory + * access following registration is reordered before + * registration. + */ + smp_mb(); + return 0; + } + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + /* + * For mm with multiple users, we need to ensure all future + * scheduler executions will observe @mm's new membarrier + * state. + */ + synchronize_rcu(); + + /* + * For each cpu runqueue, if the task's mm match @mm, ensure that all + * @mm's membarrier state set bits are also set in in the runqueue's + * membarrier state. This ensures that a runqueue scheduling + * between threads which are users of @mm has its membarrier state + * updated. + */ + cpus_read_lock(); + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct task_struct *p; + + p = rcu_dereference(&rq->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); + cpus_read_unlock(); + + return 0; +} + static int membarrier_register_global_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + int ret; if (atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) return 0; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); - if (atomic_read(&mm->mm_users) == 1) { - /* - * For single mm user, single threaded process, we can - * simply issue a memory barrier after setting - * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that - * no memory access following registration is reordered - * before registration. - */ - smp_mb(); - } else { - /* - * For multi-mm user threads, we need to ensure all - * future scheduler executions will observe the new - * thread flag state for this mm. 
- */ - synchronize_rcu(); - } + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state); @@ -213,12 +302,15 @@ static int membarrier_register_private_expedited(int flags) { struct task_struct *p = current; struct mm_struct *mm = p->mm; - int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, + ret; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; } /* @@ -226,20 +318,15 @@ static int membarrier_register_private_expedited(int flags) * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if ((atomic_read(&mm->membarrier_state) & state) == state) + if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state) return 0; - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (flags & MEMBARRIER_FLAG_SYNC_CORE) - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, - &mm->membarrier_state); - if (atomic_read(&mm->mm_users) != 1) { - /* - * Ensure all future scheduler executions will observe the - * new thread flag state for this process. - */ - synchronize_rcu(); - } - atomic_or(state, &mm->membarrier_state); + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + atomic_or(set_state, &mm->membarrier_state); + ret = sync_runqueues_membarrier_state(mm); + if (ret) + return ret; + atomic_or(ready_state, &mm->membarrier_state); return 0; } @@ -253,8 +340,10 @@ static int membarrier_register_private_expedited(int flags) * command specified does not exist, not available on the running * kernel, or if the command argument is invalid, this system call * returns -EINVAL. For a given command, with flags argument set to 0, - * this system call is guaranteed to always return the same value until - * reboot. + * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to + * always return the same value until reboot. In addition, it can return + * -ENOMEM if there is not enough memory available to perform the system + * call. * * All memory accesses performed in program order from each targeted thread * is guaranteed to be ordered with respect to sys_membarrier(). If we use diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3cb895d14a2..0db2c1b3361e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -911,6 +911,10 @@ struct rq { atomic_t nr_iowait; +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + #ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void) static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. 
+ */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif -- cgit v1.2.3 From c6d68c1c4a4d6611fc0f8145d764226571d737ca Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:04 -0400 Subject: sched/membarrier: Skip IPIs when mm->mm_users == 1 If there is only a single mm_user for the mm, the private expedited membarrier command can skip the IPIs, because only a single thread is using the mm. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-7-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 070cf433bb9a..fced54ad0f3d 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -145,20 +145,21 @@ static int membarrier_private_expedited(int flags) int cpu; bool fallback = false; cpumask_var_t tmpmask; + struct mm_struct *mm = current->mm; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; } else { - if (!(atomic_read(¤t->mm->membarrier_state) & + if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; } - if (num_online_cpus() == 1) + if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) return 0; /* @@ -194,7 +195,7 @@ static int membarrier_private_expedited(int flags) continue; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == current->mm) { + if (p && p->mm == mm) { if (!fallback) __cpumask_set_cpu(cpu, tmpmask); else -- cgit v1.2.3 From c172e0a3e8e65a4c6fffec5bc4d6de08d6f894f7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Sep 2019 13:37:05 -0400 Subject: sched/membarrier: Return -ENOMEM to userspace on memory allocation failure Remove the IPI fallback code from membarrier to deal with very infrequent cpumask memory allocation failure. Use GFP_KERNEL rather than GFP_NOWAIT, and relax the blocking guarantees for the expedited membarrier system call commands, allowing it to block if waiting for memory to be made available. In addition, now -ENOMEM can be returned to user-space if the cpumask memory allocation fails. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190919173705.2181-8-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 63 +++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index fced54ad0f3d..a39bed2c784f 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -66,7 +66,6 @@ void membarrier_exec_mmap(struct mm_struct *mm) static int membarrier_global_expedited(void) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; if (num_online_cpus() == 1) @@ -78,15 +77,8 @@ static int membarrier_global_expedited(void) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. */ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); rcu_read_lock(); @@ -117,18 +109,15 @@ static int membarrier_global_expedited(void) if (p->flags & PF_KTHREAD) continue; - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); + __cpumask_set_cpu(cpu, tmpmask); } rcu_read_unlock(); - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); - } + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -143,7 +132,6 @@ static int membarrier_global_expedited(void) static int membarrier_private_expedited(int flags) { int cpu; - bool fallback = false; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; @@ -168,15 +156,8 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - /* - * Expedited membarrier commands guarantee that they won't - * block, hence the GFP_NOWAIT allocation flag and fallback - * implementation. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { - /* Fallback for OOM. 
*/ - fallback = true; - } + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; cpus_read_lock(); rcu_read_lock(); @@ -195,20 +176,16 @@ static int membarrier_private_expedited(int flags) continue; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == mm) { - if (!fallback) - __cpumask_set_cpu(cpu, tmpmask); - else - smp_call_function_single(cpu, ipi_mb, NULL, 1); - } + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); } rcu_read_unlock(); - if (!fallback) { - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - free_cpumask_var(tmpmask); - } + + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -264,7 +241,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) struct rq *rq = cpu_rq(cpu); struct task_struct *p; - p = rcu_dereference(&rq->curr); + p = rcu_dereference(rq->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); } -- cgit v1.2.3 From 714e501e16cd473538b609b3e351b2cc9f7f09ed Mon Sep 17 00:00:00 2001 From: KeMeng Shi Date: Mon, 16 Sep 2019 06:53:28 +0000 Subject: sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr() An oops can be triggered in the scheduler when running qemu on arm64: Unable to handle kernel paging request at virtual address ffff000008effe40 Internal error: Oops: 96000007 [#1] SMP Process migration/0 (pid: 12, stack limit = 0x00000000084e3736) pstate: 20000085 (nzCv daIf -PAN -UAO) pc : __ll_sc___cmpxchg_case_acq_4+0x4/0x20 lr : move_queued_task.isra.21+0x124/0x298 ... Call trace: __ll_sc___cmpxchg_case_acq_4+0x4/0x20 __migrate_task+0xc8/0xe0 migration_cpu_stop+0x170/0x180 cpu_stopper_thread+0xec/0x178 smpboot_thread_fn+0x1ac/0x1e8 kthread+0x134/0x138 ret_from_fork+0x10/0x18 __set_cpus_allowed_ptr() will choose an active dest_cpu in affinity mask to migrage the process if process is not currently running on any one of the CPUs specified in affinity mask. __set_cpus_allowed_ptr() will choose an invalid dest_cpu (dest_cpu >= nr_cpu_ids, 1024 in my virtual machine) if CPUS in an affinity mask are deactived by cpu_down after cpumask_intersects check. cpumask_test_cpu() of dest_cpu afterwards is overflown and may pass if corresponding bit is coincidentally set. As a consequence, kernel will access an invalid rq address associate with the invalid CPU in migration_cpu_stop->__migrate_task->move_queued_task and the Oops occurs. The reproduce the crash: 1) A process repeatedly binds itself to cpu0 and cpu1 in turn by calling sched_setaffinity. 2) A shell script repeatedly does "echo 0 > /sys/devices/system/cpu/cpu1/online" and "echo 1 > /sys/devices/system/cpu/cpu1/online" in turn. 3) Oops appears if the invalid CPU is set in memory after tested cpumask. 
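For reference, a minimal userspace sketch of step 1) above (illustrative only, not the original test program; it has to run alongside the shell loop from step 2) that toggles /sys/devices/system/cpu/cpu1/online):

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>

  int main(void)
  {
          cpu_set_t mask;
          int cpu = 0;

          /* Repeatedly bind this task to cpu0 and cpu1 in turn. */
          for (;;) {
                  CPU_ZERO(&mask);
                  CPU_SET(cpu, &mask);
                  if (sched_setaffinity(0, sizeof(mask), &mask) == -1)
                          perror("sched_setaffinity");
                  cpu ^= 1;       /* alternate between cpu0 and cpu1 */
          }
          return 0;
  }

With cpu1 going offline between the intersects check and the dest_cpu selection, the old code could pick a dest_cpu beyond nr_cpu_ids, which is exactly the window the patch closes.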
Signed-off-by: KeMeng Shi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/1568616808-16808-1-git-send-email-shikemeng@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d9a3947bef4..83ea23e9e91f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1656,7 +1656,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } @@ -1677,7 +1678,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ -- cgit v1.2.3 From 763a9ec06c409dcde2a761aac4bb83ff3938e0b3 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 20 Aug 2019 14:40:55 -0400 Subject: sched/fair: Fix -Wunused-but-set-variable warnings Commit: de53fd7aedb1 ("sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices") introduced a few compilation warnings: kernel/sched/fair.c: In function '__refill_cfs_bandwidth_runtime': kernel/sched/fair.c:4365:6: warning: variable 'now' set but not used [-Wunused-but-set-variable] kernel/sched/fair.c: In function 'start_cfs_bandwidth': kernel/sched/fair.c:4992:6: warning: variable 'overrun' set but not used [-Wunused-but-set-variable] Also, __refill_cfs_bandwidth_runtime() does no longer update the expiration time, so fix the comments accordingly. Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Dave Chiluk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: pauld@redhat.com Fixes: de53fd7aedb1 ("sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices") Link: https://lkml.kernel.org/r/1566326455-8038-1-git-send-email-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5bc23996ffae..dfdac90fd211 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4353,21 +4353,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) } /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota. We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock. 
* * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; + if (cfs_b->quota != RUNTIME_INF) + cfs_b->runtime = cfs_b->quota; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4983,15 +4978,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - u64 overrun; - lockdep_assert_held(&cfs_b->lock); if (cfs_b->period_active) return; cfs_b->period_active = 1; - overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } -- cgit v1.2.3 From a49b4f4012ef233143c5f7ce44f97851e54d5ef9 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 23 Sep 2019 15:36:12 +0100 Subject: sched/core: Fix preempt_schedule() interrupt return comment preempt_schedule_irq() is the one that should be called on return from interrupt, clean up the comment to avoid any ambiguity. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190923143620.29334-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83ea23e9e91f..00ef44c8f7b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4218,9 +4218,8 @@ static void __sched notrace preempt_schedule_common(void) #ifdef CONFIG_PREEMPTION /* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. + * This is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. */ asmlinkage __visible void __sched notrace preempt_schedule(void) { @@ -4291,7 +4290,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #endif /* CONFIG_PREEMPTION */ /* - * this is the entry point to schedule() from kernel preemption + * This is the entry point to schedule() from kernel preemption * off of irq context. * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. -- cgit v1.2.3 From 9fc41acc89e58262c917b4be89ef00a87485804f Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 23 Sep 2019 10:30:17 +0100 Subject: sched/core: Remove double update_max_interval() call on CPU startup update_max_interval() is called in both CPUHP_AP_SCHED_STARTING's startup and teardown callbacks, but it turns out it's also called at the end of the startup callback of CPUHP_AP_ACTIVE (which is further down the startup sequence). There's no point in repeating this interval update in the startup sequence since the CPU will remain online until it goes down the teardown path. Remove the redundant call in sched_cpu_activate() (CPUHP_AP_ACTIVE). 
Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: vincent.guittot@linaro.org Link: https://lkml.kernel.org/r/20190923093017.11755-1-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 00ef44c8f7b5..1b61cf48eee8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6425,8 +6425,6 @@ int sched_cpu_activate(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); - update_max_interval(); - return 0; } -- cgit v1.2.3 From 4892f51ad54ddff2883a60b6ad4323c1f632a9d6 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 20 Sep 2019 11:41:15 +0200 Subject: sched/fair: Avoid redundant EAS calculation The EAS wake-up path computes the system energy for several CPU candidates: the CPU with maximum spare capacity in each performance domain, and the prev_cpu. However, if prev_cpu also happens to be the CPU with maximum spare capacity in its performance domain, the energy calculation is still done twice, unnecessarily. Add a condition to filter out this corner case before doing the energy calculation. Reported-by: Pavan Kondeti Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: juri.lelli@redhat.com Cc: morten.rasmussen@arm.com Cc: qais.yousef@arm.com Cc: rjw@rjwysocki.net Cc: tkjos@google.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Fixes: eb92692b2544 ("sched/fair: Speed-up energy-aware wake-ups") Link: https://lkml.kernel.org/r/20190920094115.GA11503@qperret.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dfdac90fd211..83ab35e2374f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6389,7 +6389,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0) { + if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { cur_delta = compute_energy(p, max_spare_cap_cpu, pd); cur_delta -= base_energy_pd; if (cur_delta < best_delta) { -- cgit v1.2.3 From fcd30ae0665c778e283f73c1c885c7fd26d12ef2 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Tue, 24 Sep 2019 09:25:21 -0700 Subject: bpf/xskmap: Return ERR_PTR for failure case instead of NULL. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When kzalloc() failed, NULL was returned to the caller, which tested the pointer with IS_ERR(), which didn't match, so the pointer was used later, resulting in a NULL dereference. Return ERR_PTR(-ENOMEM) instead of NULL. 
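As background, a generic sketch of the ERR_PTR()/IS_ERR() convention this fix relies on (hypothetical foo_* names, not the xskmap code): a caller that only checks IS_ERR() will happily dereference a plain NULL, so a helper that can fail must encode the errno into the pointer it returns.

  #include <linux/err.h>
  #include <linux/slab.h>

  struct foo {
          int val;
  };

  /* Allocation helper: encode the errno in the returned pointer. */
  static struct foo *foo_alloc(gfp_t gfp)
  {
          struct foo *f = kzalloc(sizeof(*f), gfp);

          if (!f)
                  return ERR_PTR(-ENOMEM);        /* not NULL */
          return f;
  }

  static int foo_use(void)
  {
          struct foo *f = foo_alloc(GFP_ATOMIC);

          /* IS_ERR() only matches ERR_PTR() values, not NULL. */
          if (IS_ERR(f))
                  return PTR_ERR(f);

          f->val = 1;     /* safe: f is a real object here */
          kfree(f);
          return 0;
  }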
Reported-by: syzbot+491c1b7565ba9069ecae@syzkaller.appspotmail.com Fixes: 0402acd683c6 ("xsk: remove AF_XDP socket from map when the socket is released") Signed-off-by: Jonathan Lemon Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 942c662e2eed..82a1ffe15dfa 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -37,7 +37,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); if (!node) - return NULL; + return ERR_PTR(-ENOMEM); err = xsk_map_inc(map); if (err) { -- cgit v1.2.3 From 0f74914071ab7e7b78731ed62bf350e3a344e0a5 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Wed, 25 Sep 2019 16:45:59 -0700 Subject: kernel/elfcore.c: include proper prototypes When building with W=1, gcc properly complains that there's no prototypes: CC kernel/elfcore.o kernel/elfcore.c:7:17: warning: no previous prototype for 'elf_core_extra_phdrs' [-Wmissing-prototypes] 7 | Elf_Half __weak elf_core_extra_phdrs(void) | ^~~~~~~~~~~~~~~~~~~~ kernel/elfcore.c:12:12: warning: no previous prototype for 'elf_core_write_extra_phdrs' [-Wmissing-prototypes] 12 | int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/elfcore.c:17:12: warning: no previous prototype for 'elf_core_write_extra_data' [-Wmissing-prototypes] 17 | int __weak elf_core_write_extra_data(struct coredump_params *cprm) | ^~~~~~~~~~~~~~~~~~~~~~~~~ kernel/elfcore.c:22:15: warning: no previous prototype for 'elf_core_extra_data_size' [-Wmissing-prototypes] 22 | size_t __weak elf_core_extra_data_size(void) | ^~~~~~~~~~~~~~~~~~~~~~~~ Provide the include file so gcc is happy, and we don't have potential code drift Link: http://lkml.kernel.org/r/29875.1565224705@turing-police Signed-off-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/elfcore.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/elfcore.c b/kernel/elfcore.c index fc482c8e0bd8..57fb4dcff434 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -3,6 +3,7 @@ #include #include #include +#include Elf_Half __weak elf_core_extra_phdrs(void) { -- cgit v1.2.3 From 8495f7e6732ed248b648d36439795b42ec650b9e Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 25 Sep 2019 16:47:27 -0700 Subject: fork: improve error message for corrupted page tables When a user process exits, the kernel cleans up the mm_struct of the user process and during cleanup, check_mm() checks the page tables of the user process for corruption (E.g: unexpected page flags set/cleared). For corrupted page tables, the error message printed by check_mm() isn't very clear as it prints the loop index instead of page table type (E.g: Resident file mapping pages vs Resident shared memory pages). The loop index in check_mm() is used to index rss_stat[] which represents individual memory type stats. Hence, instead of printing index, print memory type, thereby improving error message. 
Without patch: -------------- [ 204.836425] mm/pgtable-generic.c:29: bad p4d 0000000089eb4e92(800000025f941467) [ 204.836544] BUG: Bad rss-counter state mm:00000000f75895ea idx:0 val:2 [ 204.836615] BUG: Bad rss-counter state mm:00000000f75895ea idx:1 val:5 [ 204.836685] BUG: non-zero pgtables_bytes on freeing mm: 20480 With patch: ----------- [ 69.815453] mm/pgtable-generic.c:29: bad p4d 0000000084653642(800000025ca37467) [ 69.815872] BUG: Bad rss-counter state mm:00000000014a6c03 type:MM_FILEPAGES val:2 [ 69.815962] BUG: Bad rss-counter state mm:00000000014a6c03 type:MM_ANONPAGES val:5 [ 69.816050] BUG: non-zero pgtables_bytes on freeing mm: 20480 Also, change print function (from printk(KERN_ALERT, ..) to pr_alert()) so that it matches the other print statement. Link: http://lkml.kernel.org/r/da75b5153f617f4c5739c08ee6ebeb3d19db0fbc.1565123758.git.sai.praneeth.prakhya@intel.com Signed-off-by: Sai Praneeth Prakhya Reviewed-by: Anshuman Khandual Suggested-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5a0fd518e04e..60763c043aa3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -125,6 +125,15 @@ int nr_threads; /* The idle threads do not count.. */ static int max_threads; /* tunable limit on nr_threads */ +#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) + +static const char * const resident_page_types[] = { + NAMED_ARRAY_INDEX(MM_FILEPAGES), + NAMED_ARRAY_INDEX(MM_ANONPAGES), + NAMED_ARRAY_INDEX(MM_SWAPENTS), + NAMED_ARRAY_INDEX(MM_SHMEMPAGES), +}; + DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ @@ -645,12 +654,15 @@ static void check_mm(struct mm_struct *mm) { int i; + BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, + "Please make sure 'struct resident_page_types[]' is updated as well"); + for (i = 0; i < NR_MM_COUNTERS; i++) { long x = atomic_long_read(&mm->rss_stat.count[i]); if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", + mm, resident_page_types[i], x); } if (mm_pgtables_bytes(mm)) -- cgit v1.2.3 From 7c3a6aedcd6aae0a32a527e68669f7dd667492d1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 25 Sep 2019 16:47:33 -0700 Subject: kexec: bail out upon SIGKILL when allocating memory. syzbot found that a thread can stall for minutes inside kexec_load() after that thread was killed by SIGKILL [1]. It turned out that the reproducer was trying to allocate 2408MB of memory using kimage_alloc_page() from kimage_load_normal_segment(). Let's check for SIGKILL before doing memory allocation. 
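The same pattern applies to any allocation-heavy loop reachable from a syscall; a hedged sketch of the idea (hypothetical helper, not the kimage_alloc_page() path itself):

  #include <linux/errno.h>
  #include <linux/gfp.h>
  #include <linux/mm.h>
  #include <linux/sched/signal.h>

  /*
   * Allocate @nr pages one at a time, bailing out early if the calling
   * task has been killed, so a SIGKILL'ed process does not stall inside
   * the kernel for minutes.  Unwinding already-allocated pages on error
   * is left to the caller.
   */
  static int alloc_many_pages(struct page **pages, unsigned long nr)
  {
          unsigned long i;

          for (i = 0; i < nr; i++) {
                  if (fatal_signal_pending(current))
                          return -EINTR;

                  pages[i] = alloc_page(GFP_KERNEL);
                  if (!pages[i])
                          return -ENOMEM;
          }
          return 0;
  }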
[1] https://syzkaller.appspot.com/bug?id=a0e3436829698d5824231251fad9d8e998f94f5e Link: http://lkml.kernel.org/r/993c9185-d324-2640-d061-bed2dd18b1f7@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Reported-by: syzbot Cc: Eric Biederman Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d5870723b8ad..15d70a90b50d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -300,6 +300,8 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; + if (fatal_signal_pending(current)) + return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; -- cgit v1.2.3 From 7d92bda271ddcbb2d1be2f82733dcb9bf8378010 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 16:47:45 -0700 Subject: kgdb: don't use a notifier to enter kgdb at panic; call directly Right now kgdb/kdb hooks up to debug panics by registering for the panic notifier. This works OK except that it means that kgdb/kdb gets called _after_ the CPUs in the system are taken offline. That means that if anything important was happening on those CPUs (like something that might have contributed to the panic) you can't debug them. Specifically I ran into a case where I got a panic because a task was "blocked for more than 120 seconds" which was detected on CPU 2. I nicely got shown stack traces in the kernel log for all CPUs including CPU 0, which was running 'PID: 111 Comm: kworker/0:1H' and was in the middle of __mmc_switch(). I then ended up at the kdb prompt where switched over to kgdb to try to look at local variables of the process on CPU 0. I found that I couldn't. Digging more, I found that I had no info on any tasks running on CPUs other than CPU 2 and that asking kdb for help showed me "Error: no saved data for this cpu". This was because all the CPUs were offline. Let's move the entry of kdb/kgdb to a direct call from panic() and stop using the generic notifier. Putting a direct call in allows us to order things more properly and it also doesn't seem like we're breaking any abstractions by calling into the debugger from the panic function. Daniel said: : This patch changes the way kdump and kgdb interact with each other. : However it would seem rather odd to have both tools simultaneously armed : and, even if they were, the user still has the option to use panic_timeout : to force a kdump to happen. Thus I think the change of order is : acceptable. 
Link: http://lkml.kernel.org/r/20190703170354.217312-1-dianders@chromium.org Signed-off-by: Douglas Anderson Reviewed-by: Daniel Thompson Cc: Jason Wessel Cc: Kees Cook Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Feng Tang Cc: YueHaibing Cc: Sergey Senozhatsky Cc: "Steven Rostedt (VMware)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/debug/debug_core.c | 31 +++++++++++-------------------- kernel/panic.c | 8 ++++++++ 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 10f1187b3907..f76d6f77dd5e 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -893,30 +893,25 @@ static struct sysrq_key_op sysrq_dbg_op = { }; #endif -static int kgdb_panic_event(struct notifier_block *self, - unsigned long val, - void *data) +void kgdb_panic(const char *msg) { + if (!kgdb_io_module_registered) + return; + /* - * Avoid entering the debugger if we were triggered due to a panic - * We don't want to get stuck waiting for input from user in such case. - * panic_timeout indicates the system should automatically + * We don't want to get stuck waiting for input from user if + * "panic_timeout" indicates the system should automatically * reboot on panic. */ if (panic_timeout) - return NOTIFY_DONE; + return; if (dbg_kdb_mode) - kdb_printf("PANIC: %s\n", (char *)data); + kdb_printf("PANIC: %s\n", msg); + kgdb_breakpoint(); - return NOTIFY_DONE; } -static struct notifier_block kgdb_panic_event_nb = { - .notifier_call = kgdb_panic_event, - .priority = INT_MAX, -}; - void __weak kgdb_arch_late(void) { } @@ -965,8 +960,6 @@ static void kgdb_register_callbacks(void) kgdb_arch_late(); register_module_notifier(&dbg_module_load_nb); register_reboot_notifier(&dbg_reboot_notifier); - atomic_notifier_chain_register(&panic_notifier_list, - &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ register_sysrq_key('g', &sysrq_dbg_op); #endif @@ -980,16 +973,14 @@ static void kgdb_register_callbacks(void) static void kgdb_unregister_callbacks(void) { /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any + * When this routine is called KGDB should unregister from + * handlers and clean up, making sure it is not handling any * break exceptions at the time. */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; unregister_reboot_notifier(&dbg_reboot_notifier); unregister_module_notifier(&dbg_module_load_nb); - atomic_notifier_chain_unregister(&panic_notifier_list, - &kgdb_panic_event_nb); kgdb_arch_exit(); #ifdef CONFIG_MAGIC_SYSRQ unregister_sysrq_key('g', &sysrq_dbg_op); diff --git a/kernel/panic.c b/kernel/panic.c index 057540b6eee9..d1ece4c363b9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -219,6 +220,13 @@ void panic(const char *fmt, ...) dump_stack(); #endif + /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left + * running on them. + */ + kgdb_panic(buf); + /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. -- cgit v1.2.3 From ee8711336c51708382627ebcaee5f2122b77dfef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:52 -0700 Subject: bug: refactor away warn_slowpath_fmt_taint() Patch series "Clean up WARN() "cut here" handling", v2. 
Christophe Leroy noticed that the fix for missing "cut here" in the WARN() case was adding explicit printk() calls instead of teaching the exception handler to add it. This refactors the bug/warn infrastructure to pass this information as a new BUGFLAG. Longer details repeated from the last patch in the series: bug: move WARN_ON() "cut here" into exception handler The original cleanup of "cut here" missed the WARN_ON() case (that does not have a printk message), which was fixed recently by adding an explicit printk of "cut here". This had the downside of adding a printk() to every WARN_ON() caller, which reduces the utility of using an instruction exception to streamline the resulting code. By making this a new BUGFLAG, all of these can be removed and "cut here" can be handled by the exception handler. This was very pronounced on PowerPC, but the effect can be seen on x86 as well. The resulting text size of a defconfig build shows some small savings from this patch: text data bss dec hex filename 19691167 5134320 1646664 26472151 193eed7 vmlinux.before 19676362 5134260 1663048 26473670 193f4c6 vmlinux.after This change also opens the door for creating something like BUG_MSG(), where a custom printk() before issuing BUG(), without confusing the "cut here" line. This patch (of 7): There's no reason to have specialized helpers for passing the warn taint down to __warn(). Consolidate and refactor helper macros, removing __WARN_printf() and warn_slowpath_fmt_taint(). Link: http://lkml.kernel.org/r/20190819234111.9019-2-keescook@chromium.org Signed-off-by: Kees Cook Cc: Christophe Leroy Cc: Peter Zijlstra Cc: Christophe Leroy Cc: Drew Davenport Cc: Arnd Bergmann Cc: "Steven Rostedt (VMware)" Cc: Feng Tang Cc: Petr Mladek Cc: Mauro Carvalho Chehab Cc: Borislav Petkov Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index d1ece4c363b9..1d89f5423426 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -600,20 +600,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, } #ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) -{ - struct warn_args args; - - args.fmt = fmt; - va_start(args.args, fmt); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, - &args); - va_end(args.args); -} -EXPORT_SYMBOL(warn_slowpath_fmt); - -void warn_slowpath_fmt_taint(const char *file, int line, - unsigned taint, const char *fmt, ...) +void warn_slowpath_fmt(const char *file, int line, unsigned taint, + const char *fmt, ...) { struct warn_args args; @@ -622,7 +610,7 @@ void warn_slowpath_fmt_taint(const char *file, int line, __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } -EXPORT_SYMBOL(warn_slowpath_fmt_taint); +EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_null(const char *file, int line) { -- cgit v1.2.3 From f2f84b05e02b7710a201f0017b3272ad7ef703d1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:58 -0700 Subject: bug: consolidate warn_slowpath_fmt() usage Instead of having a separate helper for no printk output, just consolidate the logic into warn_slowpath_fmt(). 
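The consolidation pattern itself is easy to illustrate; below is a hedged, self-contained sketch (hypothetical my_warn() helper, not the real __warn()/warn_slowpath_fmt() bodies, which also handle taint and register state): a NULL format string simply means "no message", so a separate _null() variant is unnecessary.

  #include <stdarg.h>
  #include <linux/printk.h>

  static void my_warn(const char *file, int line, const char *fmt, ...)
  {
          va_list args;

          pr_warn("------------[ cut here ]------------\n");
          pr_warn("WARNING at %s:%d\n", file, line);

          /* Plain WARN_ON()-style callers just pass a NULL @fmt. */
          if (!fmt)
                  return;

          va_start(args, fmt);
          vprintk(fmt, args);
          va_end(args);
  }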
Link: http://lkml.kernel.org/r/20190819234111.9019-4-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 1d89f5423426..79c153951b59 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,19 +605,19 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, { struct warn_args args; + if (!fmt) { + pr_warn(CUT_HERE); + __warn(file, line, __builtin_return_address(0), taint, + NULL, NULL); + return; + } + args.fmt = fmt; va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); - -void warn_slowpath_null(const char *file, int line) -{ - pr_warn(CUT_HERE); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); -} -EXPORT_SYMBOL(warn_slowpath_null); #else void __warn_printk(const char *fmt, ...) { -- cgit v1.2.3 From d38aba49a9f72b862f1220739ca837c886fdc319 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:01 -0700 Subject: bug: lift "cut here" out of __warn() In preparation for cleaning up "cut here", move the "cut here" logic up out of __warn() and into callers that pass non-NULL args. For anyone looking closely, there are two callers that pass NULL args: one already explicitly prints "cut here". The remaining case is covered by how a WARN is built, which will be cleaned up in the next patch. Link: http://lkml.kernel.org/r/20190819234111.9019-5-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 79c153951b59..a643e5464296 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -559,9 +559,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - if (args) - pr_warn(CUT_HERE); - if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, @@ -605,8 +602,9 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, { struct warn_args args; + pr_warn(CUT_HERE); + if (!fmt) { - pr_warn(CUT_HERE); __warn(file, line, __builtin_return_address(0), taint, NULL, NULL); return; -- cgit v1.2.3 From 2da1ead4d5f7fa5f61e5805655de1e245d03a763 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:08 -0700 Subject: bug: consolidate __WARN_FLAGS usage Instead of having separate tests for __WARN_FLAGS, merge the two #ifdef blocks and replace the synonym WANT_WARN_ON_SLOWPATH macro. 
Link: http://lkml.kernel.org/r/20190819234111.9019-7-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index a643e5464296..47e8ebccc22b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -596,7 +596,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, add_taint(taint, LOCKDEP_STILL_OK); } -#ifdef WANT_WARN_ON_SLOWPATH +#ifndef __WARN_FLAGS void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { -- cgit v1.2.3 From e3439af4a339acd7fddbd6d59b8ecefaac07a611 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 25 Sep 2019 10:38:35 +0100 Subject: bpf: Clean up indentation issue in BTF kflag processing There is a statement that is indented one level too deeply, remove the extraneous tab. Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20190925093835.19515-1-colin.king@canonical.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 722d38e543e9..29c7c06c6bd6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2332,7 +2332,7 @@ static int btf_enum_check_kflag_member(struct btf_verifier_env *env, if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); - return -EINVAL; + return -EINVAL; } nr_bits = int_bitsize; -- cgit v1.2.3 From 768fb61fcc13b2acaca758275d54c09a65e2968b Mon Sep 17 00:00:00 2001 From: Allan Zhang Date: Wed, 25 Sep 2019 16:43:12 -0700 Subject: bpf: Fix bpf_event_output re-entry issue BPF_PROG_TYPE_SOCK_OPS program can reenter bpf_event_output because it can be called from atomic and non-atomic contexts since we don't have bpf_prog_active to prevent it happen. This patch enables 3 levels of nesting to support normal, irq and nmi context. We can easily reproduce the issue by running netperf crr mode with 100 flows and 10 threads from netperf client side. Here is the whole stack dump: [ 515.228898] WARNING: CPU: 20 PID: 14686 at kernel/trace/bpf_trace.c:549 bpf_event_output+0x1f9/0x220 [ 515.228903] CPU: 20 PID: 14686 Comm: tcp_crr Tainted: G W 4.15.0-smp-fixpanic #44 [ 515.228904] Hardware name: Intel TBG,ICH10/Ikaria_QC_1b, BIOS 1.22.0 06/04/2018 [ 515.228905] RIP: 0010:bpf_event_output+0x1f9/0x220 [ 515.228906] RSP: 0018:ffff9a57ffc03938 EFLAGS: 00010246 [ 515.228907] RAX: 0000000000000012 RBX: 0000000000000001 RCX: 0000000000000000 [ 515.228907] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffffffff836b0f80 [ 515.228908] RBP: ffff9a57ffc039c8 R08: 0000000000000004 R09: 0000000000000012 [ 515.228908] R10: ffff9a57ffc1de40 R11: 0000000000000000 R12: 0000000000000002 [ 515.228909] R13: ffff9a57e13bae00 R14: 00000000ffffffff R15: ffff9a57ffc1e2c0 [ 515.228910] FS: 00007f5a3e6ec700(0000) GS:ffff9a57ffc00000(0000) knlGS:0000000000000000 [ 515.228910] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 515.228911] CR2: 0000537082664fff CR3: 000000061fed6002 CR4: 00000000000226f0 [ 515.228911] Call Trace: [ 515.228913] [ 515.228919] [] bpf_sockopt_event_output+0x3b/0x50 [ 515.228923] [] ? 
bpf_ktime_get_ns+0xe/0x10 [ 515.228927] [] ? __cgroup_bpf_run_filter_sock_ops+0x85/0x100 [ 515.228930] [] ? tcp_init_transfer+0x125/0x150 [ 515.228933] [] ? tcp_finish_connect+0x89/0x110 [ 515.228936] [] ? tcp_rcv_state_process+0x704/0x1010 [ 515.228939] [] ? sk_filter_trim_cap+0x53/0x2a0 [ 515.228942] [] ? tcp_v6_inbound_md5_hash+0x6f/0x1d0 [ 515.228945] [] ? tcp_v6_do_rcv+0x1c0/0x460 [ 515.228947] [] ? tcp_v6_rcv+0x9f8/0xb30 [ 515.228951] [] ? ip6_route_input+0x190/0x220 [ 515.228955] [] ? ip6_protocol_deliver_rcu+0x6d/0x450 [ 515.228958] [] ? ip6_rcv_finish+0xb6/0x170 [ 515.228961] [] ? ip6_protocol_deliver_rcu+0x450/0x450 [ 515.228963] [] ? ipv6_rcv+0x61/0xe0 [ 515.228966] [] ? ipv6_list_rcv+0x330/0x330 [ 515.228969] [] ? __netif_receive_skb_one_core+0x5b/0xa0 [ 515.228972] [] ? __netif_receive_skb+0x21/0x70 [ 515.228975] [] ? process_backlog+0xb2/0x150 [ 515.228978] [] ? net_rx_action+0x16f/0x410 [ 515.228982] [] ? __do_softirq+0xdd/0x305 [ 515.228986] [] ? irq_exit+0x9c/0xb0 [ 515.228989] [] ? smp_call_function_single_interrupt+0x65/0x120 [ 515.228991] [] ? call_function_single_interrupt+0x81/0x90 [ 515.228992] [ 515.228996] [] ? io_serial_in+0x20/0x20 [ 515.229000] [] ? console_unlock+0x230/0x490 [ 515.229003] [] ? vprintk_emit+0x26a/0x2a0 [ 515.229006] [] ? vprintk_default+0x1f/0x30 [ 515.229008] [] ? vprintk_func+0x35/0x70 [ 515.229011] [] ? printk+0x50/0x66 [ 515.229013] [] ? bpf_event_output+0xb7/0x220 [ 515.229016] [] ? bpf_sockopt_event_output+0x3b/0x50 [ 515.229019] [] ? bpf_ktime_get_ns+0xe/0x10 [ 515.229023] [] ? release_sock+0x97/0xb0 [ 515.229026] [] ? tcp_recvmsg+0x31a/0xda0 [ 515.229029] [] ? __cgroup_bpf_run_filter_sock_ops+0x85/0x100 [ 515.229032] [] ? tcp_set_state+0x191/0x1b0 [ 515.229035] [] ? tcp_disconnect+0x2e/0x600 [ 515.229038] [] ? tcp_close+0x3eb/0x460 [ 515.229040] [] ? inet_release+0x42/0x70 [ 515.229043] [] ? inet6_release+0x39/0x50 [ 515.229046] [] ? __sock_release+0x4d/0xd0 [ 515.229049] [] ? sock_close+0x15/0x20 [ 515.229052] [] ? __fput+0xe7/0x1f0 [ 515.229055] [] ? ____fput+0xe/0x10 [ 515.229058] [] ? task_work_run+0x82/0xb0 [ 515.229061] [] ? exit_to_usermode_loop+0x7e/0x11f [ 515.229064] [] ? do_syscall_64+0x111/0x130 [ 515.229067] [] ? 
entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: a5a3a828cd00 ("bpf: add perf event notificaton support for sock_ops") Signed-off-by: Allan Zhang Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20190925234312.94063-2-allanzhang@google.com --- kernel/trace/bpf_trace.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1255d14576..3e38a010003c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -500,14 +500,17 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); +static DEFINE_PER_CPU(int, bpf_event_output_nest_level); +struct bpf_nested_pt_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); - struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + int nest_level = this_cpu_inc_return(bpf_event_output_nest_level); struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, @@ -522,12 +525,25 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, .data = meta, }, }; + struct perf_sample_data *sd; + struct pt_regs *regs; + u64 ret; + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { + ret = -EBUSY; + goto out; + } + sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]); + regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]); perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, sd); + ret = __bpf_perf_event_output(regs, map, flags, sd); +out: + this_cpu_dec(bpf_event_output_nest_level); + return ret; } BPF_CALL_0(bpf_get_current_task) -- cgit v1.2.3 From b9023b91dd020ad7e093baa5122b6968c48cc9e0 Mon Sep 17 00:00:00 2001 From: Balasubramani Vivekanandan Date: Thu, 26 Sep 2019 15:51:01 +0200 Subject: tick: broadcast-hrtimer: Fix a race in bc_set_next When a cpu requests broadcasting, before starting the tick broadcast hrtimer, bc_set_next() checks if the timer callback (bc_handler) is active using hrtimer_try_to_cancel(). But hrtimer_try_to_cancel() does not provide the required synchronization when the callback is active on other core. The callback could have already executed tick_handle_oneshot_broadcast() and could have also returned. But still there is a small time window where the hrtimer_try_to_cancel() returns -1. In that case bc_set_next() returns without doing anything, but the next_event of the tick broadcast clock device is already set to a timeout value. In the race condition diagram below, CPU #1 is running the timer callback and CPU #2 is entering idle state and so calls bc_set_next(). In the worst case, the next_event will contain an expiry time, but the hrtimer will not be started which happens when the racing callback returns HRTIMER_NORESTART. The hrtimer might never recover if all further requests from the CPUs to subscribe to tick broadcast have timeout greater than the next_event of tick broadcast clock device. 
This leads to cascading of failures and finally noticed as rcu stall warnings Here is a depiction of the race condition CPU #1 (Running timer callback) CPU #2 (Enter idle and subscribe to tick broadcast) --------------------- --------------------- __run_hrtimer() tick_broadcast_enter() bc_handler() __tick_broadcast_oneshot_control() tick_handle_oneshot_broadcast() raw_spin_lock(&tick_broadcast_lock); dev->next_event = KTIME_MAX; //wait for tick_broadcast_lock //next_event for tick broadcast clock set to KTIME_MAX since no other cores subscribed to tick broadcasting raw_spin_unlock(&tick_broadcast_lock); if (dev->next_event == KTIME_MAX) return HRTIMER_NORESTART // callback function exits without restarting the hrtimer //tick_broadcast_lock acquired raw_spin_lock(&tick_broadcast_lock); tick_broadcast_set_event() clockevents_program_event() dev->next_event = expires; bc_set_next() hrtimer_try_to_cancel() //returns -1 since the timer callback is active. Exits without restarting the timer cpu_base->running = NULL; The comment that hrtimer cannot be armed from within the callback is wrong. It is fine to start the hrtimer from within the callback. Also it is safe to start the hrtimer from the enter/exit idle code while the broadcast handler is active. The enter/exit idle code and the broadcast handler are synchronized using tick_broadcast_lock. So there is no need for the existing try to cancel logic. All this can be removed which will eliminate the race condition as well. Fixes: 5d1638acb9f6 ("tick: Introduce hrtimer based broadcast") Originally-by: Thomas Gleixner Signed-off-by: Balasubramani Vivekanandan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190926135101.12102-2-balasubramani_vivekanandan@mentor.com --- kernel/time/tick-broadcast-hrtimer.c | 62 +++++++++++++++++------------------- 1 file changed, 29 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index c1f5bb590b5e..b5a65e212df2 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -42,39 +42,39 @@ static int bc_shutdown(struct clock_event_device *evt) */ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) { - int bc_moved; /* - * We try to cancel the timer first. If the callback is on - * flight on some other cpu then we let it handle it. If we - * were able to cancel the timer nothing can rearm it as we - * own broadcast_lock. + * This is called either from enter/exit idle code or from the + * broadcast handler. In all cases tick_broadcast_lock is held. * - * However we can also be called from the event handler of - * ce_broadcast_hrtimer itself when it expires. We cannot - * restart the timer because we are in the callback, but we - * can set the expiry time and let the callback return - * HRTIMER_RESTART. + * hrtimer_cancel() cannot be called here neither from the + * broadcast handler nor from the enter/exit idle code. The idle + * code can run into the problem described in bc_shutdown() and the + * broadcast handler cannot wait for itself to complete for obvious + * reasons. * - * Since we are in the idle loop at this point and because - * hrtimer_{start/cancel} functions call into tracing, - * calls to these functions must be bound within RCU_NONIDLE. 
+ * Each caller tries to arm the hrtimer on its own CPU, but if the + * hrtimer callbback function is currently running, then + * hrtimer_start() cannot move it and the timer stays on the CPU on + * which it is assigned at the moment. + * + * As this can be called from idle code, the hrtimer_start() + * invocation has to be wrapped with RCU_NONIDLE() as + * hrtimer_start() can call into tracing. */ - RCU_NONIDLE( - { - bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; - if (bc_moved) { - hrtimer_start(&bctimer, expires, - HRTIMER_MODE_ABS_PINNED_HARD); - } - } - ); - - if (bc_moved) { - /* Bind the "device" to the cpu */ - bc->bound_on = smp_processor_id(); - } else if (bc->bound_on == smp_processor_id()) { - hrtimer_set_expires(&bctimer, expires); - } + RCU_NONIDLE( { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. + * + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock. + */ + bc->bound_on = bctimer.base->cpu_base->cpu; + } ); return 0; } @@ -100,10 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) - if (ce_broadcast_hrtimer.next_event != KTIME_MAX) - return HRTIMER_RESTART; - return HRTIMER_NORESTART; } -- cgit v1.2.3 From d2aea95a1a4d195d939d16303700921be318a2b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Sep 2019 05:53:29 -0400 Subject: tracing/probe: Fix to check the difference of nr_args before adding probe Steven reported that a test triggered: ================================================================== BUG: KASAN: slab-out-of-bounds in trace_kprobe_create+0xa9e/0xe40 Read of size 8 at addr ffff8880c4f25a48 by task ftracetest/4798 CPU: 2 PID: 4798 Comm: ftracetest Not tainted 5.3.0-rc6-test+ #30 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 Call Trace: dump_stack+0x7c/0xc0 ? trace_kprobe_create+0xa9e/0xe40 print_address_description+0x6c/0x332 ? trace_kprobe_create+0xa9e/0xe40 ? trace_kprobe_create+0xa9e/0xe40 __kasan_report.cold.6+0x1a/0x3b ? trace_kprobe_create+0xa9e/0xe40 kasan_report+0xe/0x12 trace_kprobe_create+0xa9e/0xe40 ? print_kprobe_event+0x280/0x280 ? match_held_lock+0x1b/0x240 ? find_held_lock+0xac/0xd0 ? fs_reclaim_release.part.112+0x5/0x20 ? lock_downgrade+0x350/0x350 ? kasan_unpoison_shadow+0x30/0x40 ? __kasan_kmalloc.constprop.6+0xc1/0xd0 ? trace_kprobe_create+0xe40/0xe40 ? trace_kprobe_create+0xe40/0xe40 create_or_delete_trace_kprobe+0x2e/0x60 trace_run_command+0xc3/0xe0 ? trace_panic_handler+0x20/0x20 ? kasan_unpoison_shadow+0x30/0x40 trace_parse_run_command+0xdc/0x163 vfs_write+0xe1/0x240 ksys_write+0xba/0x150 ? __ia32_sys_read+0x50/0x50 ? tracer_hardirqs_on+0x61/0x180 ? trace_hardirqs_off_caller+0x43/0x110 ? mark_held_locks+0x29/0xa0 ? do_syscall_64+0x14/0x260 do_syscall_64+0x68/0x260 Fix to check the difference of nr_args before adding probe on existing probes. This also may set the error log index bigger than the number of command parameters. In that case it sets the error position is next to the last parameter. 
Link: http://lkml.kernel.org/r/156966474783.3478.13217501608215769150.stgit@devnote2 Fixes: ca89bc071d5e ("tracing/kprobe: Add multi-probe per event support") Reported-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index baf58a3612c0..905b10af5d5c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -178,6 +178,16 @@ void __trace_probe_log_err(int offset, int err_type) if (!command) return; + if (trace_probe_log.index >= trace_probe_log.argc) { + /** + * Set the error position is next to the last arg + space. + * Note that len includes the terminal null and the cursor + * appaers at pos + 1. + */ + pos = len; + offset = 0; + } + /* And make a command string from argv array */ p = command; for (i = 0; i < trace_probe_log.argc; i++) { @@ -1084,6 +1094,12 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) { int i; + /* In case of more arguments */ + if (a->nr_args < b->nr_args) + return a->nr_args + 1; + if (a->nr_args > b->nr_args) + return b->nr_args + 1; + for (i = 0; i < a->nr_args; i++) { if ((b->nr_args <= i) || ((a->args[i].type != b->args[i].type) || -- cgit v1.2.3 From 968e5170939662341242812b9c82ef51cf140a33 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 26 Sep 2019 09:22:59 -0700 Subject: tracing: Fix clang -Wint-in-bool-context warnings in IF_ASSIGN macro After r372664 in clang, the IF_ASSIGN macro causes a couple hundred warnings along the lines of: kernel/trace/trace_output.c:1331:2: warning: converting the enum constant to a boolean [-Wint-in-bool-context] kernel/trace/trace.h:409:3: note: expanded from macro 'trace_assign_type' IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, ^ kernel/trace/trace.h:371:14: note: expanded from macro 'IF_ASSIGN' WARN_ON(id && (entry)->type != id); \ ^ 264 warnings generated. This warning can catch issues with constructs like: if (state == A || B) where the developer really meant: if (state == A || state == B) This is currently the only occurrence of the warning in the kernel tree across defconfig, allyesconfig, allmodconfig for arm32, arm64, and x86_64. Add the implicit '!= 0' to the WARN_ON statement to fix the warnings and find potential issues in the future. 
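To make the bug class concrete, a hedged example of what the warning is designed to catch (hypothetical enum, not from the tracing code):

  #include <linux/types.h>

  enum chunk_state { CHUNK_FREE = 1, CHUNK_BUSY = 2 };

  /*
   * Buggy: CHUNK_BUSY is a non-zero constant used in boolean context,
   * so the condition is always true.  This is what clang now flags.
   */
  static bool chunk_known_buggy(enum chunk_state state)
  {
          return state == CHUNK_FREE || CHUNK_BUSY;
  }

  /* What the author almost certainly meant: */
  static bool chunk_known_fixed(enum chunk_state state)
  {
          return state == CHUNK_FREE || state == CHUNK_BUSY;
  }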
Link: https://github.com/llvm/llvm-project/commit/28b38c277a2941e9e891b2db30652cfd962f070b Link: https://github.com/ClangBuiltLinux/linux/issues/686 Link: http://lkml.kernel.org/r/20190926162258.466321-1-natechancellor@gmail.com Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 26b0a08f3c7d..f801d154ff6a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -365,11 +365,11 @@ static inline struct trace_array *top_trace_array(void) __builtin_types_compatible_p(typeof(var), type *) #undef IF_ASSIGN -#define IF_ASSIGN(var, entry, etype, id) \ - if (FTRACE_CMP_TYPE(var, etype)) { \ - var = (typeof(var))(entry); \ - WARN_ON(id && (entry)->type != id); \ - break; \ +#define IF_ASSIGN(var, entry, etype, id) \ + if (FTRACE_CMP_TYPE(var, etype)) { \ + var = (typeof(var))(entry); \ + WARN_ON(id != 0 && (entry)->type != id); \ + break; \ } /* Will cause compile errors if type is not found. */ -- cgit v1.2.3 From 96c5c6e6a5b6db592acae039fed54b5c8844cd35 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Fri, 20 Sep 2019 17:57:59 -0500 Subject: tracing: Have error path in predicate_parse() free its allocated memory In predicate_parse, there is an error path that is not going to out_free instead it returns directly which leads to a memory leak. Link: http://lkml.kernel.org/r/20190920225800.3870-1-navid.emamdoost@gmail.com Signed-off-by: Navid Emamdoost Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c773b8fb270c..c9a74f82b14a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -452,8 +452,10 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, switch (*next) { case '(': /* #2 */ - if (top - op_stack > nr_parens) - return ERR_PTR(-EINVAL); + if (top - op_stack > nr_parens) { + ret = -EINVAL; + goto out_free; + } *(++top) = invert; continue; case '!': /* #3 */ -- cgit v1.2.3 From f14c234b4bc5184fd40d9a47830e5b32c3b36d49 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:53 +1000 Subject: clone3: switch to copy_struct_from_user() Switch clone3() syscall from it's own copying struct clone_args from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. Additionally, explicitly define CLONE_ARGS_SIZE_VER0 to match the other users of the struct-extension pattern. 
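For context, a minimal sketch of the struct-extension pattern that copy_struct_from_user() encapsulates (hypothetical my_args struct and sizes; the real clone3() conversion is in the diff below): the kernel copy is zero-filled when userspace passes an older, smaller struct, and -E2BIG is returned when userspace passes a newer, larger struct whose extra bytes are not all zero.

  #include <linux/errno.h>
  #include <linux/mm.h>
  #include <linux/types.h>
  #include <linux/uaccess.h>

  struct my_args {                        /* hypothetical versioned uapi struct */
          __u64 flags;
          __u64 extra;                    /* added in a later ABI revision */
  };
  #define MY_ARGS_SIZE_VER0       8       /* sizeof() the first published version */

  static int copy_my_args(struct my_args *kargs,
                          struct my_args __user *uargs, size_t usize)
  {
          if (unlikely(usize > PAGE_SIZE))        /* sanity limit */
                  return -E2BIG;
          if (unlikely(usize < MY_ARGS_SIZE_VER0))
                  return -EINVAL;

          /* Zero-fills the tail of *kargs or rejects unknown non-zero bits. */
          return copy_struct_from_user(kargs, sizeof(*kargs), uargs, usize);
  }

The clone3(), sched_setattr() and perf_event_open() conversions in this series all reduce to this same shape.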
Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-3-cyphar@cyphar.com Signed-off-by: Christian Brauner --- kernel/fork.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f9572f416126..2ef529869c64 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2525,39 +2525,19 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, - size_t size) + size_t usize) { + int err; struct clone_args args; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(usize > PAGE_SIZE)) return -E2BIG; - - if (unlikely(size < sizeof(struct clone_args))) + if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) return -EINVAL; - if (unlikely(!access_ok(uargs, size))) - return -EFAULT; - - if (size > sizeof(struct clone_args)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uargs + sizeof(struct clone_args); - end = (void __user *)uargs + size; - - for (; addr < end; addr++) { - if (get_user(val, addr)) - return -EFAULT; - if (val) - return -E2BIG; - } - - size = sizeof(struct clone_args); - } - - if (copy_from_user(&args, uargs, size)) - return -EFAULT; + err = copy_struct_from_user(&args, sizeof(args), uargs, usize); + if (err) + return err; /* * Verify that higher 32bits of exit_signal are unset and that -- cgit v1.2.3 From dff3a85feceade213daf153556fd32a3b0a73c63 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:54 +1000 Subject: sched_setattr: switch to copy_struct_from_user() Switch sched_setattr() syscall from it's own copying struct sched_attr from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. Ideally we could also unify sched_getattr(2)-style syscalls as well, but unfortunately the correct semantics for such syscalls are much less clear (see [1] for more detail). In future we could come up with a more sane idea for how the syscall interface should look. 
[1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code") Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-4-cyphar@cyphar.com Signed-off-by: Christian Brauner --- kernel/sched/core.c | 43 +++++++------------------------------------ 1 file changed, 7 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7880f4f64d0e..dd05a378631a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5106,9 +5106,6 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a u32 size; int ret; - if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; - /* Zero the full structure, so that a short copy will be nice: */ memset(attr, 0, sizeof(*attr)); @@ -5116,45 +5113,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a if (ret) return ret; - /* Bail out on silly large: */ - if (size > PAGE_SIZE) - goto err_size; - /* ABI compatibility quirk: */ if (!size) size = SCHED_ATTR_SIZE_VER0; - - if (size < SCHED_ATTR_SIZE_VER0) + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -5354,7 +5325,7 @@ sched_attr_copy_to_user(struct sched_attr __user *uattr, * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. - * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility. + * @usize: sizeof(attr) for fwd/bwd comp. * @flags: for future extension. */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -- cgit v1.2.3 From c2ba8f41ad366dc12840dd87d8f1565185845126 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:55 +1000 Subject: perf_event_open: switch to copy_struct_from_user() Switch perf_event_open() syscall from it's own copying struct perf_event_attr from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. 
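From userspace, the convention shared by clone3(), sched_setattr() and perf_event_open() is that the caller states the size of the struct it was compiled against: inside the struct itself for perf_event_open(), as a separate argument for clone3() and sched_setattr(). An illustrative, deliberately minimal perf example:

  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static int open_cycles_counter(void)
  {
          struct perf_event_attr attr = {
                  .type     = PERF_TYPE_HARDWARE,
                  .size     = sizeof(attr),  /* ABI version this binary knows */
                  .config   = PERF_COUNT_HW_CPU_CYCLES,
                  .disabled = 1,
          };

          /* pid 0 = calling task, cpu -1 = any CPU, no group, no flags */
          return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
  }

If a newer binary hands an older kernel a larger struct, the call is still accepted as long as all the fields the kernel does not know about are zero; otherwise it fails with -E2BIG, which is exactly the behaviour copy_struct_from_user() preserves.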
Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-5-cyphar@cyphar.com Signed-off-by: Christian Brauner --- kernel/events/core.c | 47 +++++++++-------------------------------------- 1 file changed, 9 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4655adbbae10..3f0cb82e4fbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10586,55 +10586,26 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, u32 size; int ret; - if (!access_ok(uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ + /* Zero the full structure, so that a short copy will be nice. */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ + /* ABI compatibility quirk: */ + if (!size) size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) + if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - attr->size = size; if (attr->__reserved_1) -- cgit v1.2.3 From 73956fc07dd7b25d4a33ab3fdd6247c60d0b237c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Oct 2019 10:50:33 +0200 Subject: membarrier: Fix RCU locking bug caused by faulty merge The following commit: 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy load") got fat fingered by me when merging it with other patches. It meant to move the RCU section out of the for loop but ended up doing it partially, leaving a superfluous rcu_read_lock() inside, causing havok. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Cc: linux-tip-commits@vger.kernel.org Fixes: 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy load") Link: https://lkml.kernel.org/r/20191001085033.GP4519@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index a39bed2c784f..168479a7d61b 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -174,7 +174,6 @@ static int membarrier_private_expedited(int flags) */ if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); -- cgit v1.2.3 From 501bd0166eb949820c8e4204e62aaee5c89c84d9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 27 Sep 2019 17:28:42 +0200 Subject: fork: add kernel-doc for clone3 Add kernel-doc for the clone3() syscall. Link: https://lore.kernel.org/r/20191001114701.24661-2-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- kernel/fork.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f9572f416126..bf11cf39579a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2604,6 +2604,17 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) return true; } +/** + * clone3 - create a new process with specific properties + * @uargs: argument structure + * @size: size of @uargs + * + * clone3() is the extensible successor to clone()/clone2(). + * It takes a struct as argument that is versioned by its size. + * + * Return: On success, a positive PID for the child process. + * On error, a negative errno number. + */ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) { int err; -- cgit v1.2.3 From 86cdd2fdc4e39c388d39c7ba2396d1a9dfd66226 Mon Sep 17 00:00:00 2001 From: Dmitry Goldin Date: Fri, 4 Oct 2019 10:40:07 +0000 Subject: kheaders: make headers archive reproducible In commit 43d8ce9d65a5 ("Provide in-kernel headers to make extending kernel easier") a new mechanism was introduced, for kernels >=5.2, which embeds the kernel headers in the kernel image or a module and exposes them in procfs for use by userland tools. The archive containing the header files has nondeterminism caused by header files metadata. This patch normalizes the metadata and utilizes KBUILD_BUILD_TIMESTAMP if provided and otherwise falls back to the default behaviour. In commit f7b101d33046 ("kheaders: Move from proc to sysfs") it was modified to use sysfs and the script for generation of the archive was renamed to what is being patched. Signed-off-by: Dmitry Goldin Reviewed-by: Greg Kroah-Hartman Reviewed-by: Joel Fernandes (Google) Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 9ff449888d9c..aff79e461fc9 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -71,7 +71,10 @@ done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 find $cpio_dir -type f -print0 | xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' -tar -Jcf $tarfile -C $cpio_dir/ . 
> /dev/null +# Create archive and try to normalize metadata for reproducibility +tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --owner=0 --group=0 --sort=name --numeric-owner \ + -Jcf $tarfile -C $cpio_dir/ . > /dev/null echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 -- cgit v1.2.3 From 2cf2aa6a69db0b17b3979144287af8775c1c1534 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Sat, 5 Oct 2019 10:23:30 +0200 Subject: dma-mapping: fix false positivse warnings in dma_common_free_remap() Commit 5cf4537975bb ("dma-mapping: introduce a dma_common_find_pages helper") changed invalid input check in dma_common_free_remap() from: if (!area || !area->flags != VM_DMA_COHERENT) to if (!area || !area->flags != VM_DMA_COHERENT || !area->pages) which seem to produce false positives for memory obtained via dma_common_contiguous_remap() This triggers the following warning message when doing "reboot" on ZII VF610 Dev Board Rev B: WARNING: CPU: 0 PID: 1 at kernel/dma/remap.c:112 dma_common_free_remap+0x88/0x8c trying to free invalid coherent area: 9ef82980 Modules linked in: CPU: 0 PID: 1 Comm: systemd-shutdow Not tainted 5.3.0-rc6-next-20190820 #119 Hardware name: Freescale Vybrid VF5xx/VF6xx (Device Tree) Backtrace: [<8010d1ec>] (dump_backtrace) from [<8010d588>] (show_stack+0x20/0x24) r7:8015ed78 r6:00000009 r5:00000000 r4:9f4d9b14 [<8010d568>] (show_stack) from [<8077e3f0>] (dump_stack+0x24/0x28) [<8077e3cc>] (dump_stack) from [<801197a0>] (__warn.part.3+0xcc/0xe4) [<801196d4>] (__warn.part.3) from [<80119830>] (warn_slowpath_fmt+0x78/0x94) r6:00000070 r5:808e540c r4:81c03048 [<801197bc>] (warn_slowpath_fmt) from [<8015ed78>] (dma_common_free_remap+0x88/0x8c) r3:9ef82980 r2:808e53e0 r7:00001000 r6:a0b1e000 r5:a0b1e000 r4:00001000 [<8015ecf0>] (dma_common_free_remap) from [<8010fa9c>] (remap_allocator_free+0x60/0x68) r5:81c03048 r4:9f4d9b78 [<8010fa3c>] (remap_allocator_free) from [<801100d0>] (__arm_dma_free.constprop.3+0xf8/0x148) r5:81c03048 r4:9ef82900 [<8010ffd8>] (__arm_dma_free.constprop.3) from [<80110144>] (arm_dma_free+0x24/0x2c) r5:9f563410 r4:80110120 [<80110120>] (arm_dma_free) from [<8015d80c>] (dma_free_attrs+0xa0/0xdc) [<8015d76c>] (dma_free_attrs) from [<8020f3e4>] (dma_pool_destroy+0xc0/0x154) r8:9efa8860 r7:808f02f0 r6:808f02d0 r5:9ef82880 r4:9ef82780 [<8020f324>] (dma_pool_destroy) from [<805525d0>] (ehci_mem_cleanup+0x6c/0x150) r7:9f563410 r6:9efa8810 r5:00000000 r4:9efd0148 [<80552564>] (ehci_mem_cleanup) from [<80558e0c>] (ehci_stop+0xac/0xc0) r5:9efd0148 r4:9efd0000 [<80558d60>] (ehci_stop) from [<8053c4bc>] (usb_remove_hcd+0xf4/0x1b0) r7:9f563410 r6:9efd0074 r5:81c03048 r4:9efd0000 [<8053c3c8>] (usb_remove_hcd) from [<8056361c>] (host_stop+0x48/0xb8) r7:9f563410 r6:9efd0000 r5:9f5f4040 r4:9f5f5040 [<805635d4>] (host_stop) from [<80563d0c>] (ci_hdrc_host_destroy+0x34/0x38) r7:9f563410 r6:9f5f5040 r5:9efa8800 r4:9f5f4040 [<80563cd8>] (ci_hdrc_host_destroy) from [<8055ef18>] (ci_hdrc_remove+0x50/0x10c) [<8055eec8>] (ci_hdrc_remove) from [<804a2ed8>] (platform_drv_remove+0x34/0x4c) r7:9f563410 r6:81c4f99c r5:9efa8810 r4:9efa8810 [<804a2ea4>] (platform_drv_remove) from [<804a18a8>] (device_release_driver_internal+0xec/0x19c) r5:00000000 r4:9efa8810 [<804a17bc>] (device_release_driver_internal) from [<804a1978>] (device_release_driver+0x20/0x24) r7:9f563410 r6:81c41ed0 r5:9efa8810 r4:9f4a1dac [<804a1958>] (device_release_driver) from [<804a01b8>] (bus_remove_device+0xdc/0x108) [<804a00dc>] (bus_remove_device) from 
[<8049c204>] (device_del+0x150/0x36c) r7:9f563410 r6:81c03048 r5:9efa8854 r4:9efa8810 [<8049c0b4>] (device_del) from [<804a3368>] (platform_device_del.part.2+0x20/0x84) r10:9f563414 r9:809177e0 r8:81cb07dc r7:81c78320 r6:9f563454 r5:9efa8800 r4:9efa8800 [<804a3348>] (platform_device_del.part.2) from [<804a3420>] (platform_device_unregister+0x28/0x34) r5:9f563400 r4:9efa8800 [<804a33f8>] (platform_device_unregister) from [<8055dce0>] (ci_hdrc_remove_device+0x1c/0x30) r5:9f563400 r4:00000001 [<8055dcc4>] (ci_hdrc_remove_device) from [<805652ac>] (ci_hdrc_imx_remove+0x38/0x118) r7:81c78320 r6:9f563454 r5:9f563410 r4:9f541010 [<8056538c>] (ci_hdrc_imx_shutdown) from [<804a2970>] (platform_drv_shutdown+0x2c/0x30) [<804a2944>] (platform_drv_shutdown) from [<8049e4fc>] (device_shutdown+0x158/0x1f0) [<8049e3a4>] (device_shutdown) from [<8013ac80>] (kernel_restart_prepare+0x44/0x48) r10:00000058 r9:9f4d8000 r8:fee1dead r7:379ce700 r6:81c0b280 r5:81c03048 r4:00000000 [<8013ac3c>] (kernel_restart_prepare) from [<8013ad14>] (kernel_restart+0x1c/0x60) [<8013acf8>] (kernel_restart) from [<8013af84>] (__do_sys_reboot+0xe0/0x1d8) r5:81c03048 r4:00000000 [<8013aea4>] (__do_sys_reboot) from [<8013b0ec>] (sys_reboot+0x18/0x1c) r8:80101204 r7:00000058 r6:00000000 r5:00000000 r4:00000000 [<8013b0d4>] (sys_reboot) from [<80101000>] (ret_fast_syscall+0x0/0x54) Exception stack(0x9f4d9fa8 to 0x9f4d9ff0) 9fa0: 00000000 00000000 fee1dead 28121969 01234567 379ce700 9fc0: 00000000 00000000 00000000 00000058 00000000 00000000 00000000 00016d04 9fe0: 00028e0c 7ec87c64 000135ec 76c1f410 Restore original invalid input check in dma_common_free_remap() to avoid this problem. Fixes: 5cf4537975bb ("dma-mapping: introduce a dma_common_find_pages helper") Signed-off-by: Andrey Smirnov [hch: just revert the offending hunk instead of creating a new helper] Signed-off-by: Christoph Hellwig --- kernel/dma/remap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index ca4e5d44b571..c00b9258fa6a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -87,9 +87,9 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, */ void dma_common_free_remap(void *cpu_addr, size_t size) { - struct page **pages = dma_common_find_pages(cpu_addr); + struct vm_struct *area = find_vm_area(cpu_addr); - if (!pages) { + if (!area || area->flags != VM_DMA_COHERENT) { WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); return; } -- cgit v1.2.3 From 0e48f51cbbfbdb79149806de14dcb8bf0f01ca94 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 4 Oct 2019 13:00:25 +0300 Subject: Revert "libata, freezer: avoid block device removal while system is frozen" This reverts commit 85fbd722ad0f5d64d1ad15888cd1eb2188bfb557. The commit was added as a quick band-aid for a hang that happened when a block device was removed during system suspend. Now that bdi_wq is not freezable anymore the hang should not be possible and we can get rid of this hack by reverting it. Acked-by: Rafael J. Wysocki Signed-off-by: Mika Westerberg Signed-off-by: Jens Axboe --- kernel/freezer.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index c0738424bb43..dc520f01f99d 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -22,12 +22,6 @@ EXPORT_SYMBOL(system_freezing_cnt); bool pm_freezing; bool pm_nosig_freezing; -/* - * Temporary export for the deadlock workaround in ata_scsi_hotplug(). 
- * Remove once the hack becomes unnecessary. - */ -EXPORT_SYMBOL_GPL(pm_freezing); - /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); -- cgit v1.2.3 From f733c6b508bcaa3441ba1eacf16efb9abd47489f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 4 Oct 2019 15:57:29 +0300 Subject: perf/core: Fix inheritance of aux_output groups Commit: ab43762ef010 ("perf: Allow normal events to output AUX data") forgets to configure aux_output relation in the inherited groups, which results in child PEBS events forever failing to schedule. Fix this by setting up the AUX output link in the inheritance path. Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191004125729.32397-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3f0cb82e4fbc..f953dd16a5e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11862,6 +11862,10 @@ static int inherit_group(struct perf_event *parent_event, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); + + if (sub->aux_event == parent_event && + !perf_get_aux_event(child_ctr, leader)) + return -EINVAL; } return 0; } -- cgit v1.2.3 From 20bb759a66be52cf4a9ddd17fddaf509e11490cd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Sun, 6 Oct 2019 17:58:00 -0700 Subject: panic: ensure preemption is disabled during panic() Calling 'panic()' on a kernel with CONFIG_PREEMPT=y can leave the calling CPU in an infinite loop, but with interrupts and preemption enabled. From this state, userspace can continue to be scheduled, despite the system being "dead" as far as the kernel is concerned. This is easily reproducible on arm64 when booting with "nosmp" on the command line; a couple of shell scripts print out a periodic "Ping" message whilst another triggers a crash by writing to /proc/sysrq-trigger: | sysrq: Trigger a crash | Kernel panic - not syncing: sysrq triggered crash | CPU: 0 PID: 1 Comm: init Not tainted 5.2.15 #1 | Hardware name: linux,dummy-virt (DT) | Call trace: | dump_backtrace+0x0/0x148 | show_stack+0x14/0x20 | dump_stack+0xa0/0xc4 | panic+0x140/0x32c | sysrq_handle_reboot+0x0/0x20 | __handle_sysrq+0x124/0x190 | write_sysrq_trigger+0x64/0x88 | proc_reg_write+0x60/0xa8 | __vfs_write+0x18/0x40 | vfs_write+0xa4/0x1b8 | ksys_write+0x64/0xf0 | __arm64_sys_write+0x14/0x20 | el0_svc_common.constprop.0+0xb0/0x168 | el0_svc_handler+0x28/0x78 | el0_svc+0x8/0xc | Kernel Offset: disabled | CPU features: 0x0002,24002004 | Memory Limit: none | ---[ end Kernel panic - not syncing: sysrq triggered crash ]--- | Ping 2! | Ping 1! | Ping 1! | Ping 2! The issue can also be triggered on x86 kernels if CONFIG_SMP=n, otherwise local interrupts are disabled in 'smp_send_stop()'. Disable preemption in 'panic()' before re-enabling interrupts. 
Link: http://lkml.kernel.org/r/20191002123538.22609-1-will@kernel.org Link: https://lore.kernel.org/r/BX1W47JXPMR8.58IYW53H6M5N@dragonstone Signed-off-by: Will Deacon Reported-by: Xogium Reviewed-by: Kees Cook Cc: Russell King Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Petr Mladek Cc: Feng Tang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 47e8ebccc22b..f470a038b05b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -180,6 +180,7 @@ void panic(const char *fmt, ...) * after setting panic_cpu) from invoking panic() again. */ local_irq_disable(); + preempt_disable_notrace(); /* * It's possible to come here directly from a panic-assertion and -- cgit v1.2.3 From b0f53dbc4bc4c371f38b14c391095a3bb8a0bb40 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sun, 6 Oct 2019 17:58:19 -0700 Subject: kernel/sysctl.c: do not override max_threads provided by userspace Partially revert 16db3d3f1170 ("kernel/sysctl.c: threads-max observe limits") because the patch is causing a regression to any workload which needs to override the auto-tuning of the limit provided by kernel. set_max_threads is implementing a boot time guesstimate to provide a sensible limit of the concurrently running threads so that runaways will not deplete all the memory. This is a good thing in general but there are workloads which might need to increase this limit for an application to run (reportedly WebSpher MQ is affected) and that is simply not possible after the mentioned change. It is also very dubious to override an admin decision by an estimation that doesn't have any direct relation to correctness of the kernel operation. Fix this by dropping set_max_threads from sysctl_max_threads so any value is accepted as long as it fits into MAX_THREADS which is important to check because allowing more threads could break internal robust futex restriction. While at it, do not use MIN_THREADS as the lower boundary because it is also only a heuristic for automatic estimation and admin might have a good reason to stop new threads to be created even when below this limit. This became more severe when we switched x86 from 4k to 8k kernel stacks. Starting since 6538b8ea886e ("x86_64: expand kernel stack to 16K") (3.16) we use THREAD_SIZE_ORDER = 2 and that halved the auto-tuned value. In the particular case 3.12 kernel.threads-max = 515561 4.4 kernel.threads-max = 200000 Neither of the two values is really insane on 32GB machine. I am not sure we want/need to tune the max_thread value further. If anything the tuning should be removed altogether if proven not useful in general. But we definitely need a way to override this auto-tuning. Link: http://lkml.kernel.org/r/20190922065801.GB18814@dhcp22.suse.cz Fixes: 16db3d3f1170 ("kernel/sysctl.c: threads-max observe limits") Signed-off-by: Michal Hocko Reviewed-by: "Eric W. 
Biederman" Cc: Heinrich Schuchardt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1f6c45f6a734..bcdf53125210 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2925,7 +2925,7 @@ int sysctl_max_threads(struct ctl_table *table, int write, struct ctl_table t; int ret; int threads = max_threads; - int min = MIN_THREADS; + int min = 1; int max = MAX_THREADS; t = *table; @@ -2937,7 +2937,7 @@ int sysctl_max_threads(struct ctl_table *table, int write, if (ret || !write) return ret; - set_max_threads(threads); + max_threads = threads; return 0; } -- cgit v1.2.3 From 4929a4e6faa0f13289a67cae98139e727f0d4a97 Mon Sep 17 00:00:00 2001 From: Xuewei Zhang Date: Thu, 3 Oct 2019 17:12:43 -0700 Subject: sched/fair: Scale bandwidth quota and period without losing quota/period ratio precision The quota/period ratio is used to ensure a child task group won't get more bandwidth than the parent task group, and is calculated as: normalized_cfs_quota() = [(quota_us << 20) / period_us] If the quota/period ratio was changed during this scaling due to precision loss, it will cause inconsistency between parent and child task groups. See below example: A userspace container manager (kubelet) does three operations: 1) Create a parent cgroup, set quota to 1,000us and period to 10,000us. 2) Create a few children cgroups. 3) Set quota to 1,000us and period to 10,000us on a child cgroup. These operations are expected to succeed. However, if the scaling of 147/128 happens before step 3, quota and period of the parent cgroup will be changed: new_quota: 1148437ns, 1148us new_period: 11484375ns, 11484us And when step 3 comes in, the ratio of the child cgroup will be 104857, which will be larger than the parent cgroup ratio (104821), and will fail. Scaling them by a factor of 2 will fix the problem. Tested-by: Phil Auld Signed-off-by: Xuewei Zhang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Phil Auld Cc: Anton Blanchard Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vincent Guittot Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup") Link: https://lkml.kernel.org/r/20191004001243.140897-1-xueweiz@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 83ab35e2374f..682a754ea3e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4926,20 +4926,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); - new = (old * 147) / 128; /* ~115% */ - new = min(new, max_cfs_quota_period); - - cfs_b->period = ns_to_ktime(new); - - /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ - cfs_b->quota *= new; - cfs_b->quota = div64_u64(cfs_b->quota, old); - - pr_warn_ratelimited( - "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", - smp_processor_id(), - div_u64(new, NSEC_PER_USEC), - div_u64(cfs_b->quota, NSEC_PER_USEC)); + /* + * Grow period by a factor of 2 to avoid losing precision. + * Precision loss in the quota/period ratio can cause __cfs_schedulable + * to fail. 
+ */ + new = old * 2; + if (new < max_cfs_quota_period) { + cfs_b->period = ns_to_ktime(new); + cfs_b->quota *= 2; + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } else { + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(old, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } /* reset count so we don't come right back in here */ count = 0; -- cgit v1.2.3 From 68e7a4d66b0ce04bf18ff2ffded5596ab3618585 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 25 Sep 2019 23:42:42 +0200 Subject: sched/vtime: Fix guest/system mis-accounting on task switch vtime_account_system() assumes that the target task to account cputime to is always the current task. This is most often true indeed except on task switch where we call: vtime_common_task_switch(prev) vtime_account_system(prev) Here prev is the scheduling-out task where we account the cputime to. It doesn't match current that is already the scheduling-in task at this stage of the context switch. So we end up checking the wrong task flags to determine if we are accounting guest or system time to the previous task. As a result the wrong task is used to check if the target is running in guest mode. We may then spuriously account or leak either system or guest time on task switch. Fix this assumption and also turn vtime_guest_enter/exit() to use the task passed in parameter as well to avoid future similar issues. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Fixes: 2a42eb9594a1 ("sched/cputime: Accumulate vtime on top of nsec clocksource") Link: https://lkml.kernel.org/r/20190925214242.21873-1-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2305ce89a26c..46ed4e1383e2 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -740,7 +740,7 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); /* We might have scheduled out from guest path */ - if (current->flags & PF_VCPU) + if (tsk->flags & PF_VCPU) vtime_account_guest(tsk, vtime); else __vtime_account_system(tsk, vtime); @@ -783,7 +783,7 @@ void vtime_guest_enter(struct task_struct *tsk) */ write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk, vtime); - current->flags |= PF_VCPU; + tsk->flags |= PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); @@ -794,7 +794,7 @@ void vtime_guest_exit(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); vtime_account_guest(tsk, vtime); - current->flags &= ~PF_VCPU; + tsk->flags &= ~PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); -- cgit v1.2.3 From d44248a41337731a111374822d7d4451b64e73e4 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 4 Sep 2019 14:46:18 -0700 Subject: perf/core: Rework memory accounting in perf_mmap() perf_mmap() always increases user->locked_vm. As a result, "extra" could grow bigger than "user_extra", which doesn't make sense. 
Here is an example case: (Note: Assume "user_lock_limit" is very small.) | # of perf_mmap calls |vma->vm_mm->pinned_vm|user->locked_vm| | 0 | 0 | 0 | | 1 | user_extra | user_extra | | 2 | 3 * user_extra | 2 * user_extra| | 3 | 6 * user_extra | 3 * user_extra| | 4 | 10 * user_extra | 4 * user_extra| Fix this by maintaining proper user_extra and extra. Reviewed-By: Hechao Li Reported-by: Hechao Li Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Jie Meng Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190904214618.3795672-1-songliubraving@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f953dd16a5e2..2b8265ad7bf5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5668,7 +5668,8 @@ again: * undo the VM accounting. */ - atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, + &mmap_user->locked_vm); atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); @@ -5812,8 +5813,20 @@ accounting: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - if (user_locked > user_lock_limit) + if (user_locked <= user_lock_limit) { + /* charge all to locked_vm */ + } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) { + /* charge all to pinned_vm */ + extra = user_extra; + user_extra = 0; + } else { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ extra = user_locked - user_lock_limit; + user_extra -= extra; + } lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; -- cgit v1.2.3 From 7fa343b7fdc4f351de4e3f28d5c285937dd1f42f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 8 Oct 2019 09:59:49 -0700 Subject: perf/core: Fix corner case in perf_rotate_context() In perf_rotate_context(), when the first cpu flexible event fail to schedule, cpu_rotate is 1, while cpu_event is NULL. Since cpu_event is NULL, perf_rotate_context will _NOT_ call cpu_ctx_sched_out(), thus cpuctx->ctx.is_active will have EVENT_FLEXIBLE set. Then, the next perf_event_sched_in() will skip all cpu flexible events because of the EVENT_FLEXIBLE bit. In the next call of perf_rotate_context(), cpu_rotate stays 1, and cpu_event stays NULL, so this process repeats. The end result is, flexible events on this cpu will not be scheduled (until another event being added to the cpuctx). Here is an easy repro of this issue. On Intel CPUs, where ref-cycles could only use one counter, run one pinned event for ref-cycles, one flexible event for ref-cycles, and one flexible event for cycles. The flexible ref-cycles is never scheduled, which is expected. However, because of this issue, the cycles event is never scheduled either. $ perf stat -e ref-cycles:D,ref-cycles,cycles -C 5 -I 1000 time counts unit events 1.000152973 15,412,480 ref-cycles:D 1.000152973 ref-cycles (0.00%) 1.000152973 cycles (0.00%) 2.000486957 18,263,120 ref-cycles:D 2.000486957 ref-cycles (0.00%) 2.000486957 cycles (0.00%) To fix this, when the flexible_active list is empty, try rotate the first event in the flexible_groups. Also, rename ctx_first_active() to ctx_event_to_rotate(), which is more accurate. 
Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Fixes: 8d5bce0c37fa ("perf/core: Optimize perf_rotate_context() event scheduling") Link: https://lkml.kernel.org/r/20191008165949.920548-1-songliubraving@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2b8265ad7bf5..9ec0b0bfddbd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3779,11 +3779,23 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) perf_event_groups_insert(&ctx->flexible_groups, event); } +/* pick an event from the flexible_groups to rotate */ static inline struct perf_event * -ctx_first_active(struct perf_event_context *ctx) +ctx_event_to_rotate(struct perf_event_context *ctx) { - return list_first_entry_or_null(&ctx->flexible_active, - struct perf_event, active_list); + struct perf_event *event; + + /* pick the first active flexible event */ + event = list_first_entry_or_null(&ctx->flexible_active, + struct perf_event, active_list); + + /* if no active flexible event, pick the first event */ + if (!event) { + event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), + typeof(*event), group_node); + } + + return event; } static bool perf_rotate_context(struct perf_cpu_context *cpuctx) @@ -3808,9 +3820,9 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_disable(cpuctx->ctx.pmu); if (task_rotate) - task_event = ctx_first_active(task_ctx); + task_event = ctx_event_to_rotate(task_ctx); if (cpu_rotate) - cpu_event = ctx_first_active(&cpuctx->ctx); + cpu_event = ctx_event_to_rotate(&cpuctx->ctx); /* * As per the order given at ctx_resched() first 'pop' task flexible -- cgit v1.2.3 From f49249d58abdc12067d69f15d509487171148b30 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 8 Oct 2019 11:46:46 +0100 Subject: PM: sleep: include for pm_wq Include the for the definition of pm_wq to avoid the following warning: kernel/power/main.c:890:25: warning: symbol 'pm_wq' was not declared. Should it be static? Signed-off-by: Ben Dooks Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index e8710d179b35..e26de7af520b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "power.h" -- cgit v1.2.3 From 9ef16693aff8137faa21d16ffe65bb9832d24d71 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 17:56:57 -0400 Subject: ftrace: Get a reference counter for the trace_array on filter files The ftrace set_ftrace_filter and set_ftrace_notrace files are specific for an instance now. They need to take a reference to the instance otherwise there could be a race between accessing the files and deleting the instance. It wasn't until the :mod: caching where these file operations started referencing the trace_array directly. 
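In outline, the pattern applied here, and by the follow-up patches below that convert the remaining tracing file_operations, pairs the reference acquisition in ->open() with a release in ->release(). The function names in this sketch are illustrative, not the actual kernel helpers:

  static int example_open(struct inode *inode, struct file *filp)
  {
          struct trace_array *tr = inode->i_private;

          /* Fails if the instance is already being torn down; on success
           * the trace_array is pinned until ->release(). */
          if (trace_array_get(tr) < 0)
                  return -ENODEV;

          filp->private_data = tr;
          return 0;
  }

  static int example_release(struct inode *inode, struct file *filp)
  {
          struct trace_array *tr = inode->i_private;

          trace_array_put(tr);    /* drop the reference taken in ->open() */
          return 0;
  }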
Cc: stable@vger.kernel.org Fixes: 673feb9d76ab3 ("ftrace: Add :mod: caching infrastructure to trace_array") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 62a50bf399d6..32c2eb167de0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3540,21 +3540,22 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, struct ftrace_hash *hash; struct list_head *mod_head; struct trace_array *tr = ops->private; - int ret = 0; + int ret = -ENOMEM; ftrace_ops_init(ops); if (unlikely(ftrace_disabled)) return -ENODEV; + if (tr && trace_array_get(tr) < 0) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) - return -ENOMEM; + goto out; - if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { - kfree(iter); - return -ENOMEM; - } + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) + goto out; iter->ops = ops; iter->flags = flag; @@ -3584,13 +3585,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, if (!iter->hash) { trace_parser_put(&iter->parser); - kfree(iter); - ret = -ENOMEM; goto out_unlock; } } else iter->hash = hash; + ret = 0; + if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -3602,7 +3603,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, /* Failed */ free_ftrace_hash(iter->hash); trace_parser_put(&iter->parser); - kfree(iter); } } else file->private_data = iter; @@ -3610,6 +3610,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, out_unlock: mutex_unlock(&ops->func_hash->regex_lock); + out: + if (ret) { + kfree(iter); + if (tr) + trace_array_put(tr); + } + return ret; } @@ -5037,6 +5044,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_unlock(&iter->ops->func_hash->regex_lock); free_ftrace_hash(iter->hash); + if (iter->tr) + trace_array_put(iter->tr); kfree(iter); return 0; -- cgit v1.2.3 From 194c2c74f5532e62c218adeb8e2b683119503907 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 18:19:17 -0400 Subject: tracing: Get trace_array reference for available_tracers files As instances may have different tracers available, we need to look at the trace_array descriptor that shows the list of the available tracers for the instance. But there's a race between opening the file and an admin deleting the instance. The trace_array_get() needs to be called before accessing the trace_array. 
Cc: stable@vger.kernel.org Fixes: 607e2ea167e56 ("tracing: Set up infrastructure to allow tracers for instances") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 252f79c435f8..fa7d813b04c6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4355,9 +4355,14 @@ static int show_traces_open(struct inode *inode, struct file *file) if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + ret = seq_open(file, &show_traces_seq_ops); - if (ret) + if (ret) { + trace_array_put(tr); return ret; + } m = file->private_data; m->private = tr; @@ -4365,6 +4370,14 @@ static int show_traces_open(struct inode *inode, struct file *file) return 0; } +static int show_traces_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return seq_release(inode, file); +} + static ssize_t tracing_write_stub(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) @@ -4395,8 +4408,8 @@ static const struct file_operations tracing_fops = { static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, - .release = seq_release, .llseek = seq_lseek, + .release = show_traces_release, }; static ssize_t -- cgit v1.2.3 From aa07d71f1bc7ea20e442e812b5de9d632b7f84c6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 19:12:21 -0400 Subject: tracing: Have trace events system open call tracing_open_generic_tr() Instead of having the trace events system open call open code the taking of the trace_array descriptor (with trace_array_get()) and then calling trace_open_generic(), have it use the tracing_open_generic_tr() that does the combination of the two. This requires making tracing_open_generic_tr() global. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 17 +++-------------- 3 files changed, 5 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa7d813b04c6..94f1b9124939 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4156,7 +4156,7 @@ bool tracing_is_disabled(void) * Open and update trace_array ref count. * Must have the current trace_array passed to it. 
*/ -static int tracing_open_generic_tr(struct inode *inode, struct file *filp) +int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f801d154ff6a..854dbf4050f8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -681,6 +681,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); +int tracing_open_generic_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b89cdfe20bc1..9613a757c902 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1440,28 +1440,17 @@ static int system_tr_open(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; int ret; - if (tracing_is_disabled()) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; - /* Make a temporary dir that has no system but points to tr */ dir = kzalloc(sizeof(*dir), GFP_KERNEL); - if (!dir) { - trace_array_put(tr); + if (!dir) return -ENOMEM; - } - - dir->tr = tr; - ret = tracing_open_generic(inode, filp); + ret = tracing_open_generic_tr(inode, filp); if (ret < 0) { - trace_array_put(tr); kfree(dir); return ret; } - + dir->tr = tr; filp->private_data = dir; return 0; -- cgit v1.2.3 From 8530dec63e7b486e3761cc3d74a22de301845ff5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 17:39:57 -0400 Subject: tracing: Add tracing_check_open_get_tr() Currently, most files in the tracefs directory test if tracing_disabled is set. If so, it should return -ENODEV. The tracing_disabled is called when tracing is found to be broken. Originally it was done in case the ring buffer was found to be corrupted, and we wanted to prevent reading it from crashing the kernel. But it's also called if a tracing selftest fails on boot. It's a one way switch. That is, once it is triggered, tracing is disabled until reboot. As most tracefs files can also be used by instances in the tracefs directory, they need to be carefully done. Each instance has a trace_array associated to it, and when the instance is removed, the trace_array is freed. But if an instance is opened with a reference to the trace_array, then it requires looking up the trace_array to get its ref counter (as there could be a race with it being deleted and the open itself). Once it is found, a reference is added to prevent the instance from being removed (and the trace_array associated with it freed). Combine the two checks (tracing_disabled and trace_array_get()) into a single helper function. This will also make it easier to add lockdown to tracefs later. 
Link: http://lkml.kernel.org/r/20191011135458.7399da44@gandalf.local.home Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 ++- kernel/trace/trace.c | 117 ++++++++++++++++++++++----------------- kernel/trace/trace.h | 1 + kernel/trace/trace_dynevent.c | 4 ++ kernel/trace/trace_events.c | 10 ++-- kernel/trace/trace_events_hist.c | 2 +- 6 files changed, 81 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 32c2eb167de0..8b765a55e01c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3547,7 +3547,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, if (unlikely(ftrace_disabled)) return -ENODEV; - if (tr && trace_array_get(tr) < 0) + if (tracing_check_open_get_tr(tr)) return -ENODEV; iter = kzalloc(sizeof(*iter), GFP_KERNEL); @@ -6546,8 +6546,9 @@ ftrace_pid_open(struct inode *inode, struct file *file) struct seq_file *m; int ret = 0; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 94f1b9124939..26ee280f852b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -304,6 +304,17 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } +int tracing_check_open_get_tr(struct trace_array *tr) +{ + if (tracing_disabled) + return -ENODEV; + + if (tr && trace_array_get(tr) < 0) + return -ENODEV; + + return 0; +} + int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) @@ -4140,8 +4151,11 @@ release: int tracing_open_generic(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; filp->private_data = inode->i_private; return 0; @@ -4159,12 +4173,11 @@ bool tracing_is_disabled(void) int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; + int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; filp->private_data = inode->i_private; @@ -4233,10 +4246,11 @@ static int tracing_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; - int ret = 0; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { @@ -4352,11 +4366,9 @@ static int show_traces_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = seq_open(file, &show_traces_seq_ops); if (ret) { @@ -4710,11 +4722,9 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = single_open(file, tracing_trace_options_show, inode->i_private); if (ret < 0) @@ -5051,8 +5061,11 @@ static const struct seq_operations 
tracing_saved_tgids_seq_ops = { static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; return seq_open(filp, &tracing_saved_tgids_seq_ops); } @@ -5128,8 +5141,11 @@ static const struct seq_operations tracing_saved_cmdlines_seq_ops = { static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; return seq_open(filp, &tracing_saved_cmdlines_seq_ops); } @@ -5293,8 +5309,11 @@ static const struct seq_operations tracing_eval_map_seq_ops = { static int tracing_eval_map_open(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; return seq_open(filp, &tracing_eval_map_seq_ops); } @@ -5817,13 +5836,11 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; - int ret = 0; - - if (tracing_disabled) - return -ENODEV; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; mutex_lock(&trace_types_lock); @@ -6560,11 +6577,9 @@ static int tracing_clock_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr)) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = single_open(file, tracing_clock_show, inode->i_private); if (ret < 0) @@ -6594,11 +6609,9 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr)) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); if (ret < 0) @@ -6651,10 +6664,11 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; struct trace_iterator *iter; struct seq_file *m; - int ret = 0; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, true); @@ -7118,8 +7132,9 @@ static int tracing_err_log_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret = 0; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; /* If this file was opened for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -7170,11 +7185,9 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) struct ftrace_buffer_info *info; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 854dbf4050f8..d685c61085c0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -338,6 +338,7 @@ extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); extern void 
trace_array_put(struct trace_array *tr); +extern int tracing_check_open_get_tr(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index a41fed46c285..89779eb84a07 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -174,6 +174,10 @@ static int dyn_event_open(struct inode *inode, struct file *file) { int ret; + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(NULL); if (ret < 0) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9613a757c902..71ab0a3660f4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1794,8 +1794,9 @@ ftrace_event_set_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -1814,8 +1815,9 @@ ftrace_event_set_pid_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9468bd8d44a2..dd18d76bf1bd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1680,7 +1680,7 @@ static int save_hist_vars(struct hist_trigger_data *hist_data) if (var_data) return 0; - if (trace_array_get(tr) < 0) + if (tracing_check_open_get_tr(tr)) return -ENODEV; var_data = kzalloc(sizeof(*var_data), GFP_KERNEL); -- cgit v1.2.3 From 17911ff38aa58d3c95c07589dbf5d3564c4cf3c5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 17:22:50 -0400 Subject: tracing: Add locked_down checks to the open calls of files created for tracefs Added various checks on open tracefs calls to see if tracefs is in lockdown mode, and if so, to return -EPERM. Note, the event format files (which are basically standard on all machines) as well as the enabled_functions file (which shows what is currently being traced) are not lockde down. Perhaps they should be, but it seems counter intuitive to lockdown information to help you know if the system has been modified. 
Link: http://lkml.kernel.org/r/CAHk-=wj7fGPKUspr579Cii-w_y60PtRaiDgKuxVtBAMK0VNNkA@mail.gmail.com Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 23 ++++++++++++++++++++++- kernel/trace/trace.c | 8 ++++++++ kernel/trace/trace_events.c | 8 ++++++++ kernel/trace/trace_events_hist.c | 11 +++++++++++ kernel/trace/trace_events_trigger.c | 8 +++++++- kernel/trace/trace_kprobe.c | 12 +++++++++++- kernel/trace/trace_printk.c | 7 +++++++ kernel/trace/trace_stack.c | 8 ++++++++ kernel/trace/trace_stat.c | 6 +++++- kernel/trace/trace_uprobe.c | 11 +++++++++++ 10 files changed, 98 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8b765a55e01c..f296d89be757 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -3486,6 +3487,11 @@ static int ftrace_avail_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -3505,6 +3511,15 @@ ftrace_enabled_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; + /* + * This shows us what functions are currently being + * traced and by what. Not sure if we want lockdown + * to hide such critical information for an admin. + * Although, perhaps it can show information we don't + * want people to see, but if something is tracing + * something, we probably want to know about it. + */ + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); if (!iter) return -ENOMEM; @@ -3625,6 +3640,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) { struct ftrace_ops *ops = inode->i_private; + /* Checks for tracefs lockdown */ return ftrace_regex_open(ops, FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, inode, file); @@ -3635,6 +3651,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) { struct ftrace_ops *ops = inode->i_private; + /* Checks for tracefs lockdown */ return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, inode, file); } @@ -5203,9 +5220,13 @@ static int __ftrace_graph_open(struct inode *inode, struct file *file, struct ftrace_graph_data *fgd) { - int ret = 0; + int ret; struct ftrace_hash *new_hash = NULL; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 26ee280f852b..2b4eff383505 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -306,6 +307,12 @@ void trace_array_put(struct trace_array *this_tr) int tracing_check_open_get_tr(struct trace_array *tr) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if (tracing_disabled) return -ENODEV; @@ -6813,6 +6820,7 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) struct ftrace_buffer_info *info; int ret; + /* The following checks for tracefs lockdown */ ret = tracing_buffers_open(inode, filp); if (ret < 0) return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 71ab0a3660f4..fba87d10f0c1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) fmt #include +#include #include #include #include @@ -1294,6 
+1295,8 @@ static int trace_format_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; + /* Do we want to hide event format files on tracefs lockdown? */ + ret = seq_open(file, &trace_format_seq_ops); if (ret < 0) return ret; @@ -1760,6 +1763,10 @@ ftrace_event_open(struct inode *inode, struct file *file, struct seq_file *m; int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = seq_open(file, seq_ops); if (ret < 0) return ret; @@ -1784,6 +1791,7 @@ ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; + /* Checks for tracefs lockdown */ return ftrace_event_open(inode, file, seq_ops); } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index dd18d76bf1bd..57648c5aa679 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -1448,6 +1449,10 @@ static int synth_events_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(&synth_event_ops); if (ret < 0) @@ -5515,6 +5520,12 @@ static int hist_show(struct seq_file *m, void *v) static int event_hist_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return single_open(file, hist_show, file); } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 2a2912cb4533..2cd53ca21b51 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -5,6 +5,7 @@ * Copyright (C) 2013 Tom Zanussi */ +#include #include #include #include @@ -173,7 +174,11 @@ static const struct seq_operations event_triggers_seq_ops = { static int event_trigger_regex_open(struct inode *inode, struct file *file) { - int ret = 0; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; mutex_lock(&event_mutex); @@ -292,6 +297,7 @@ event_trigger_write(struct file *filp, const char __user *ubuf, static int event_trigger_open(struct inode *inode, struct file *filp) { + /* Checks for tracefs lockdown */ return event_trigger_regex_open(inode, filp); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 324ffbea3556..1552a95c743b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -7,11 +7,11 @@ */ #define pr_fmt(fmt) "trace_kprobe: " fmt +#include #include #include #include #include -#include #include /* for COMMAND_LINE_SIZE */ @@ -936,6 +936,10 @@ static int probes_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(&trace_kprobe_ops); if (ret < 0) @@ -988,6 +992,12 @@ static const struct seq_operations profile_seq_op = { static int profile_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &profile_seq_op); } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index c3fd849d4a8f..d4e31e969206 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -6,6 +6,7 @@ * */ #include +#include #include #include #include @@ 
-348,6 +349,12 @@ static const struct seq_operations show_format_seq_ops = { static int ftrace_formats_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &show_format_seq_ops); } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index ec9a34a97129..4df9a209f7ca 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include @@ -470,6 +471,12 @@ static const struct seq_operations stack_trace_seq_ops = { static int stack_trace_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &stack_trace_seq_ops); } @@ -487,6 +494,7 @@ stack_trace_filter_open(struct inode *inode, struct file *file) { struct ftrace_ops *ops = inode->i_private; + /* Checks for tracefs lockdown */ return ftrace_regex_open(ops, FTRACE_ITER_FILTER, inode, file); } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75bf1bcb4a8a..9ab0a1a7ad5e 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -9,7 +9,7 @@ * */ - +#include #include #include #include @@ -238,6 +238,10 @@ static int tracing_stat_open(struct inode *inode, struct file *file) struct seq_file *m; struct stat_session *session = inode->i_private; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = stat_seq_init(session); if (ret) return ret; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index dd884341f5c5..352073d36585 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) "trace_uprobe: " fmt +#include #include #include #include @@ -769,6 +770,10 @@ static int probes_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(&trace_uprobe_ops); if (ret) @@ -818,6 +823,12 @@ static const struct seq_operations profile_seq_op = { static int profile_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &profile_seq_op); } -- cgit v1.2.3 From 98dc19c11470ee6048aba723d77079ad2cda8a52 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Thu, 10 Oct 2019 11:50:46 -0700 Subject: tracing/hwlat: Report total time spent in all NMIs during the sample nmi_total_ts is supposed to record the total time spent in *all* NMIs that occur on the given CPU during the (active portion of the) sampling window. However, the code seems to be overwriting this variable for each NMI, thereby only recording the time spent in the most recent NMI. Fix it by accumulating the duration instead. Link: http://lkml.kernel.org/r/157073343544.17189.13911783866738671133.stgit@srivatsa-ubuntu Fixes: 7b2c86250122 ("tracing: Add NMI tracing in hwlat detector") Cc: stable@vger.kernel.org Signed-off-by: Srivatsa S. 
Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index fa95139445b2..a0251a78807d 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -150,7 +150,7 @@ void trace_hwlat_callback(bool enter) if (enter) nmi_ts_start = time_get(); else - nmi_total_ts = time_get() - nmi_ts_start; + nmi_total_ts += time_get() - nmi_ts_start; } if (enter) -- cgit v1.2.3 From fc64e4ad80d4b72efce116f87b3174f0b7196f8e Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Thu, 10 Oct 2019 11:51:01 -0700 Subject: tracing/hwlat: Don't ignore outer-loop duration when calculating max_latency max_latency is intended to record the maximum ever observed hardware latency, which may occur in either part of the loop (inner/outer). So we need to also consider the outer-loop sample when updating max_latency. Link: http://lkml.kernel.org/r/157073345463.17189.18124025522664682811.stgit@srivatsa-ubuntu Fixes: e7c15cd8a113 ("tracing: Added hardware latency tracer") Cc: stable@vger.kernel.org Signed-off-by: Srivatsa S. Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index a0251a78807d..862f4b0139fc 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -256,6 +256,8 @@ static int get_sample(void) /* Keep a running maximum ever recorded hardware latency */ if (sample > tr->max_latency) tr->max_latency = sample; + if (outer_sample > tr->max_latency) + tr->max_latency = outer_sample; } out: -- cgit v1.2.3 From d303de1fcf344ff7c15ed64c3f48a991c9958775 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 11 Oct 2019 16:21:34 +0200 Subject: tracing: Initialize iter->seq after zeroing in tracing_read_pipe() A customer reported the following softlockup: [899688.160002] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [test.sh:16464] [899688.160002] CPU: 0 PID: 16464 Comm: test.sh Not tainted 4.12.14-6.23-azure #1 SLE12-SP4 [899688.160002] RIP: 0010:up_write+0x1a/0x30 [899688.160002] Kernel panic - not syncing: softlockup: hung tasks [899688.160002] RIP: 0010:up_write+0x1a/0x30 [899688.160002] RSP: 0018:ffffa86784d4fde8 EFLAGS: 00000257 ORIG_RAX: ffffffffffffff12 [899688.160002] RAX: ffffffff970fea00 RBX: 0000000000000001 RCX: 0000000000000000 [899688.160002] RDX: ffffffff00000001 RSI: 0000000000000080 RDI: ffffffff970fea00 [899688.160002] RBP: ffffffffffffffff R08: ffffffffffffffff R09: 0000000000000000 [899688.160002] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8b59014720d8 [899688.160002] R13: ffff8b59014720c0 R14: ffff8b5901471090 R15: ffff8b5901470000 [899688.160002] tracing_read_pipe+0x336/0x3c0 [899688.160002] __vfs_read+0x26/0x140 [899688.160002] vfs_read+0x87/0x130 [899688.160002] SyS_read+0x42/0x90 [899688.160002] do_syscall_64+0x74/0x160 It caught the process in the middle of trace_access_unlock(). There is no loop. So, it must be looping in the caller tracing_read_pipe() via the "waitagain" label. Crashdump analyze uncovered that iter->seq was completely zeroed at this point, including iter->seq.seq.size. It means that print_trace_line() was never able to print anything and there was no forward progress. 
The culprit seems to be in the code: /* reset all but tr, trace, and overruns */ memset(&iter->seq, 0, sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); It was added in v2.6.27-rc1 by commit 53d0aa773053ab182877 ("ftrace: add logic to record overruns"), back when iter->seq looked like: struct trace_seq { unsigned char buffer[PAGE_SIZE]; unsigned int len; }; There was no "size" variable and zeroing was perfectly fine. The solution is to reinitialize the structure after zeroing it (or to not zero it at all). Link: http://lkml.kernel.org/r/20191011142134.11997-1-pmladek@suse.com Signed-off-by: Petr Mladek Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2b4eff383505..6a0ee9178365 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6036,6 +6036,7 @@ waitagain: sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); cpumask_clear(iter->started); + trace_seq_init(&iter->seq); iter->pos = -1; trace_event_read_lock(); -- cgit v1.2.3 From ff229eee3d897f52bd001c841f2d3cce8853ecdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2019 10:32:04 -0700 Subject: hrtimer: Annotate lockless access to timer->base Followup to commit dd2261ed45aa ("hrtimer: Protect lockless access to timer->base"): lock_hrtimer_base() fetches timer->base without lock exclusion. The compiler is allowed to read timer->base twice (even if that would be dumb), which could end up locking migration_base and returning &migration_base. base = timer->base; if (likely(base != &migration_base)) { /* compiler reads timer->base again, and now (base == &migration_base) raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* == &migration_base ! */ Similarly, the write sides must use WRITE_ONCE() to avoid store tearing.
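The same rule in a self-contained form (a simplified sketch with made-up names, not the hrtimer code itself): a lockless reader must load the shared field exactly once into a local and do all later checks through that local, while writers use WRITE_ONCE() so the store cannot be torn.

  #include <linux/compiler.h>
  #include <linux/errno.h>

  enum { STATE_RUNNING, STATE_MIGRATING };

  struct sample_ctx {
  	int state;	/* written concurrently by another CPU */
  };

  static int sample_state(struct sample_ctx *ctx)
  {
  	/* One marked load; with a plain access the compiler may legally
  	 * re-read ctx->state for each use below, so the value that was
  	 * checked and the value that is returned could differ. */
  	int state = READ_ONCE(ctx->state);

  	if (state == STATE_MIGRATING)
  		return -EAGAIN;

  	return state;	/* guaranteed to be the value just checked */
  }

  /* The write side pairs with WRITE_ONCE() to avoid store tearing. */
  static void sample_set_state(struct sample_ctx *ctx, int state)
  {
  	WRITE_ONCE(ctx->state, state);
  }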
Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191008173204.180879-1-edumazet@google.com --- kernel/time/hrtimer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0d4dc241c0fb..65605530ee34 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -164,7 +164,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, struct hrtimer_clock_base *base; for (;;) { - base = timer->base; + base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) @@ -244,7 +244,7 @@ again: return base; /* See the comment in lock_hrtimer_base() */ - timer->base = &migration_base; + WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); @@ -253,10 +253,10 @@ again: raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; - timer->base = base; + WRITE_ONCE(timer->base, base); goto again; } - timer->base = new_base; + WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { -- cgit v1.2.3 From b67114db64adcb0aaff0fe0bb2a84ede0a99aaf2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 4 Oct 2019 13:10:09 +0200 Subject: parisc: sysctl.c: Use CONFIG_PARISC instead of __hppa_ define Signed-off-by: Helge Deller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 00fcea236eba..b6f2f35d0bcf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -163,7 +163,7 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_SPARC #endif -#ifdef __hppa__ +#ifdef CONFIG_PARISC extern int pwrsw_enabled; #endif @@ -620,7 +620,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef __hppa__ +#ifdef CONFIG_PARISC { .procname = "soft-power", .data = &pwrsw_enabled, -- cgit v1.2.3 From bc88f85c6c09306bd21917e1ae28205e9cd775a7 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 16 Oct 2019 12:24:58 +0100 Subject: kthread: make __kthread_queue_delayed_work static The __kthread_queue_delayed_work is not exported so make it static, to avoid the following sparse warning: kernel/kthread.c:869:6: warning: symbol '__kthread_queue_delayed_work' was not declared. Should it be static? 
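For context, sparse emits this warning for any function definition that has external linkage but no declaration in scope; a minimal illustration outside the kernel tree (file and symbol names are hypothetical):

  /* scale.c -- hypothetical example */

  /* Used only inside this file. Without "static" the function has
   * external linkage and no prototype in any header, so sparse warns:
   *   warning: symbol 'scale_one' was not declared. Should it be static?
   * Internal linkage silences the warning and keeps the symbol private. */
  static long scale_one(long v, long by)
  {
  	return v * by;
  }

  /* Public entry point, declared in a (hypothetical) scale.h. */
  long scale_all(long *vals, int n, long by)
  {
  	int i;

  	for (i = 0; i < n; i++)
  		vals[i] = scale_one(vals[i], by);
  	return n;
  }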
Signed-off-by: Ben Dooks Signed-off-by: Linus Torvalds --- kernel/kthread.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 621467c33fef..b262f47046ca 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -866,9 +866,9 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); -void __kthread_queue_delayed_work(struct kthread_worker *worker, - struct kthread_delayed_work *dwork, - unsigned long delay) +static void __kthread_queue_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; -- cgit v1.2.3 From 700dea5a0bea9f64eba89fae7cb2540326fdfdc1 Mon Sep 17 00:00:00 2001 From: Dmitry Goldin Date: Wed, 9 Oct 2019 13:42:14 +0000 Subject: kheaders: substituting --sort in archive creation The option --sort=ORDER was only introduced in tar 1.28 (2014), which is rather new and might not be available in some setups. This patch tries to replicate the previous behaviour as closely as possible to fix the kheaders build for older environments. It does not produce identical archives compared to the previous version due to minor sorting differences but produces reproducible results itself in my tests. Reported-by: Andreas Schwab Signed-off-by: Dmitry Goldin Tested-by: Andreas Schwab Tested-by: Quentin Perret Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index aff79e461fc9..5a0fc0b0403a 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -71,10 +71,13 @@ done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 find $cpio_dir -type f -print0 | xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' -# Create archive and try to normalize metadata for reproducibility -tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ - --owner=0 --group=0 --sort=name --numeric-owner \ - -Jcf $tarfile -C $cpio_dir/ . > /dev/null +# Create archive and try to normalize metadata for reproducibility. +# For compatibility with older versions of tar, files are fed to tar +# pre-sorted, as --sort=name might not be available. +find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ + tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --owner=0 --group=0 --numeric-owner --no-recursion \ + -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 -- cgit v1.2.3 From b1fc5833357524d5d342737913dbe32ff3557bc5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2019 11:45:36 +0100 Subject: stop_machine: Avoid potential race behaviour Both multi_cpu_stop() and set_state() access multi_stop_data::state racily using plain accesses. These are subject to compiler transformations which could break the intended behaviour of the code, and this situation is detected by KCSAN on both arm64 and x86 (splats below). Improve matters by using READ_ONCE() and WRITE_ONCE() to ensure that the compiler cannot elide, replay, or tear loads and stores. In multi_cpu_stop() the two loads of multi_stop_data::state are expected to be a consistent value, so snapshot the value into a temporary variable to ensure this. 
The state transitions are serialized by atomic manipulation of multi_stop_data::num_threads, and other fields in multi_stop_data are not modified while subject to concurrent reads. KCSAN splat on arm64: | BUG: KCSAN: data-race in multi_cpu_stop+0xa8/0x198 and set_state+0x80/0xb0 | | write to 0xffff00001003bd00 of 4 bytes by task 24 on cpu 3: | set_state+0x80/0xb0 | multi_cpu_stop+0x16c/0x198 | cpu_stopper_thread+0x170/0x298 | smpboot_thread_fn+0x40c/0x560 | kthread+0x1a8/0x1b0 | ret_from_fork+0x10/0x18 | | read to 0xffff00001003bd00 of 4 bytes by task 14 on cpu 1: | multi_cpu_stop+0xa8/0x198 | cpu_stopper_thread+0x170/0x298 | smpboot_thread_fn+0x40c/0x560 | kthread+0x1a8/0x1b0 | ret_from_fork+0x10/0x18 | | Reported by Kernel Concurrency Sanitizer on: | CPU: 1 PID: 14 Comm: migration/1 Not tainted 5.3.0-00007-g67ab35a199f4-dirty #3 | Hardware name: linux,dummy-virt (DT) KCSAN splat on x86: | write to 0xffffb0bac0013e18 of 4 bytes by task 19 on cpu 2: | set_state kernel/stop_machine.c:170 [inline] | ack_state kernel/stop_machine.c:177 [inline] | multi_cpu_stop+0x1a4/0x220 kernel/stop_machine.c:227 | cpu_stopper_thread+0x19e/0x280 kernel/stop_machine.c:516 | smpboot_thread_fn+0x1a8/0x300 kernel/smpboot.c:165 | kthread+0x1b5/0x200 kernel/kthread.c:255 | ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:352 | | read to 0xffffb0bac0013e18 of 4 bytes by task 44 on cpu 7: | multi_cpu_stop+0xb4/0x220 kernel/stop_machine.c:213 | cpu_stopper_thread+0x19e/0x280 kernel/stop_machine.c:516 | smpboot_thread_fn+0x1a8/0x300 kernel/smpboot.c:165 | kthread+0x1b5/0x200 kernel/kthread.c:255 | ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:352 | | Reported by Kernel Concurrency Sanitizer on: | CPU: 7 PID: 44 Comm: migration/7 Not tainted 5.3.0+ #1 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Acked-by: Marco Elver Link: https://lkml.kernel.org/r/20191007104536.27276-1-mark.rutland@arm.com --- kernel/stop_machine.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c7031a22aa7b..998d50ee2d9b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -7,6 +7,7 @@ * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo */ +#include #include #include #include @@ -167,7 +168,7 @@ static void set_state(struct multi_stop_data *msdata, /* Reset ack counter. */ atomic_set(&msdata->thread_ack, msdata->num_threads); smp_wmb(); - msdata->state = newstate; + WRITE_ONCE(msdata->state, newstate); } /* Last one to ack a state moves to the next state. */ @@ -186,7 +187,7 @@ void __weak stop_machine_yield(const struct cpumask *cpumask) static int multi_cpu_stop(void *data) { struct multi_stop_data *msdata = data; - enum multi_stop_state curstate = MULTI_STOP_NONE; + enum multi_stop_state newstate, curstate = MULTI_STOP_NONE; int cpu = smp_processor_id(), err = 0; const struct cpumask *cpumask; unsigned long flags; @@ -210,8 +211,9 @@ static int multi_cpu_stop(void *data) do { /* Chill out and ensure we re-read multi_stop_state. 
*/ stop_machine_yield(cpumask); - if (msdata->state != curstate) { - curstate = msdata->state; + newstate = READ_ONCE(msdata->state); + if (newstate != curstate) { + curstate = newstate; switch (curstate) { case MULTI_STOP_DISABLE_IRQ: local_irq_disable(); -- cgit v1.2.3 From 9fa8c9c647be624e91b09ecffa7cd97ee0600b40 Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Fri, 18 Oct 2019 09:20:34 +0800 Subject: tracing: Fix "gfp_t" format for synthetic events In the format of synthetic events, "gfp_t" is shown as "signed:1", but in fact "gfp_t" is unsigned and should be shown as "signed:0". The issue can be reproduced by the following commands: echo 'memlatency u64 lat; unsigned int order; gfp_t gfp_flags; int migratetype' > /sys/kernel/debug/tracing/synthetic_events cat /sys/kernel/debug/tracing/events/synthetic/memlatency/format name: memlatency ID: 2233 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:u64 lat; offset:8; size:8; signed:0; field:unsigned int order; offset:16; size:4; signed:0; field:gfp_t gfp_flags; offset:24; size:4; signed:1; field:int migratetype; offset:32; size:4; signed:1; print fmt: "lat=%llu, order=%u, gfp_flags=%x, migratetype=%d", REC->lat, REC->order, REC->gfp_flags, REC->migratetype Link: http://lkml.kernel.org/r/20191018012034.6404-1-zhengjun.xing@linux.intel.com Reviewed-by: Tom Zanussi Signed-off-by: Zhengjun Xing Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 57648c5aa679..7482a1466ebf 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -679,6 +679,8 @@ static bool synth_field_signed(char *type) { if (str_has_prefix(type, "u")) return false; + if (strcmp(type, "gfp_t") == 0) + return false; return true; } -- cgit v1.2.3 From aa5de305c90c995321df452f274fe75b52bd8fcc Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 18 Oct 2019 20:20:40 -0700 Subject: kernel/events/uprobes.c: only do FOLL_SPLIT_PMD for uprobe register Attaching a uprobe to a text section in THP splits the PMD mapped page table into PTE mapped entries. On uprobe detach, we would like to regroup the PMD mapped page table entry to regain the performance benefit of THP. However, the regrouping is broken for perf_event based trace_uprobe. This is because perf_event based trace_uprobe calls uprobe_unregister twice on close: first in TRACE_REG_PERF_CLOSE, then in TRACE_REG_PERF_UNREGISTER. The second call will split the PMD mapped page table entry, which is not the desired behavior. Fix this by only using FOLL_SPLIT_PMD in the uprobe register case. Add a WARN() to confirm that uprobe unregister never works on huge pages, and abort the operation when this WARN() triggers. Link: http://lkml.kernel.org/r/20191017164223.2762148-6-songliubraving@fb.com Fixes: 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT") Signed-off-by: Song Liu Reviewed-by: Srikar Dronamraju Cc: Kirill A.
Shutemov Cc: Oleg Nesterov Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 94d38a39d72e..c74761004ee5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -474,14 +474,17 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; bool orig_page_huge = false; + unsigned int gup_flags = FOLL_FORCE; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); retry: + if (is_register) + gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, + &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -489,6 +492,12 @@ retry: if (ret <= 0) goto put_old; + if (WARN(!is_register && PageCompound(old_page), + "uprobe unregister should never work on compound page\n")) { + ret = -EINVAL; + goto put_old; + } + /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); -- cgit v1.2.3 From 77751a466ebd1a785456556061a2db6d60ea3898 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 16 Oct 2019 12:41:24 +0200 Subject: PM: QoS: Introduce frequency QoS Introduce frequency QoS, based on the "raw" low-level PM QoS, to represent min and max frequency requests and aggregate constraints. The min and max frequency requests are to be represented by struct freq_qos_request objects and the aggregate constraints are to be represented by struct freq_constraints objects. The latter are expected to be initialized with the help of freq_constraints_init(). The freq_qos_read_value() helper is defined to retrieve the aggregate constraint values from a given struct freq_constraints object, and there are the freq_qos_add_request(), freq_qos_update_request() and freq_qos_remove_request() helpers to manipulate the min and max frequency requests. It is assumed that the helpers will not run concurrently with each other for the same struct freq_qos_request object, so if that may be the case, their users must ensure proper synchronization between them (e.g. through locking). In addition, freq_qos_add_notifier() and freq_qos_remove_notifier() are provided to add and remove notifiers that will trigger on aggregate constraint changes to and from a given struct freq_constraints object, respectively. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- kernel/power/qos.c | 240 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9568a2fe7c11..04e83fdfbe80 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -650,3 +650,243 @@ static int __init pm_qos_power_init(void) } late_initcall(pm_qos_power_init); + +/* Definitions related to the frequency QoS below. */ + +/** + * freq_constraints_init - Initialize frequency QoS constraints. + * @qos: Frequency QoS constraints to initialize.
+ */ +void freq_constraints_init(struct freq_constraints *qos) +{ + struct pm_qos_constraints *c; + + c = &qos->min_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->type = PM_QOS_MAX; + c->notifiers = &qos->min_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); + + c = &qos->max_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->type = PM_QOS_MIN; + c->notifiers = &qos->max_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); +} + +/** + * freq_qos_read_value - Get frequency QoS constraint for a given list. + * @qos: Constraints to evaluate. + * @type: QoS request type. + */ +s32 freq_qos_read_value(struct freq_constraints *qos, + enum freq_qos_req_type type) +{ + s32 ret; + + switch (type) { + case FREQ_QOS_MIN: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MIN_DEFAULT_VALUE : + pm_qos_read_value(&qos->min_freq); + break; + case FREQ_QOS_MAX: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MAX_DEFAULT_VALUE : + pm_qos_read_value(&qos->max_freq); + break; + default: + WARN_ON(1); + ret = 0; + } + + return ret; +} + +/** + * freq_qos_apply - Add/modify/remove frequency QoS request. + * @req: Constraint request to apply. + * @action: Action to perform (add/update/remove). + * @value: Value to assign to the QoS request. + */ +static int freq_qos_apply(struct freq_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret; + + switch(req->type) { + case FREQ_QOS_MIN: + ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode, + action, value); + break; + case FREQ_QOS_MAX: + ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode, + action, value); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/** + * freq_qos_add_request - Insert new frequency QoS request into a given list. + * @qos: Constraints to update. + * @req: Preallocated request object. + * @type: Request type. + * @value: Request value. + * + * Insert a new entry into the @qos list of requests, recompute the effective + * QoS constraint value for that list and initialize the @req object. The + * caller needs to save that object for later use in updates and removal. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_add_request(struct freq_constraints *qos, + struct freq_qos_request *req, + enum freq_qos_req_type type, s32 value) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !req) + return -EINVAL; + + if (WARN(freq_qos_request_active(req), + "%s() called for active request\n", __func__)) + return -EINVAL; + + req->qos = qos; + req->type = type; + ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value); + if (ret < 0) { + req->qos = NULL; + req->type = 0; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_request); + +/** + * freq_qos_update_request - Modify existing frequency QoS request. + * @req: Request to modify. + * @new_value: New request value. + * + * Update an existing frequency QoS request along with the effective constraint + * value for the list of requests it belongs to. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. 
+ */ +int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) +{ + if (!req) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + if (req->pnode.prio == new_value) + return 0; + + return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); +} +EXPORT_SYMBOL_GPL(freq_qos_update_request); + +/** + * freq_qos_remove_request - Remove frequency QoS request from its list. + * @req: Request to remove. + * + * Remove the given frequency QoS request from the list of constraints it + * belongs to and recompute the effective constraint value for that list. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_remove_request(struct freq_qos_request *req) +{ + if (!req) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + return freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); +} +EXPORT_SYMBOL_GPL(freq_qos_remove_request); + +/** + * freq_qos_add_notifier - Add frequency QoS change notifier. + * @qos: List of requests to add the notifier to. + * @type: Request type. + * @notifier: Notifier block to add. + */ +int freq_qos_add_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_register(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_register(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_notifier); + +/** + * freq_qos_remove_notifier - Remove frequency QoS change notifier. + * @qos: List of requests to remove the notifier from. + * @type: Request type. + * @notifier: Notifier block to remove. + */ +int freq_qos_remove_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_remove_notifier); -- cgit v1.2.3 From 5e6c3c7b1ec217c1c4c95d9148182302b9969b97 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 21 Oct 2019 10:33:54 +0200 Subject: perf/aux: Fix tracking of auxiliary trace buffer allocation The following commit from the v5.4 merge window: d44248a41337 ("perf/core: Rework memory accounting in perf_mmap()") ... breaks auxiliary trace buffer tracking. 
If I run command 'perf record -e rbd000' to record samples and saving them in the **auxiliary** trace buffer then the value of 'locked_vm' becomes negative after all trace buffers have been allocated and released: During allocation the values increase: [52.250027] perf_mmap user->locked_vm:0x87 pinned_vm:0x0 ret:0 [52.250115] perf_mmap user->locked_vm:0x107 pinned_vm:0x0 ret:0 [52.250251] perf_mmap user->locked_vm:0x188 pinned_vm:0x0 ret:0 [52.250326] perf_mmap user->locked_vm:0x208 pinned_vm:0x0 ret:0 [52.250441] perf_mmap user->locked_vm:0x289 pinned_vm:0x0 ret:0 [52.250498] perf_mmap user->locked_vm:0x309 pinned_vm:0x0 ret:0 [52.250613] perf_mmap user->locked_vm:0x38a pinned_vm:0x0 ret:0 [52.250715] perf_mmap user->locked_vm:0x408 pinned_vm:0x2 ret:0 [52.250834] perf_mmap user->locked_vm:0x408 pinned_vm:0x83 ret:0 [52.250915] perf_mmap user->locked_vm:0x408 pinned_vm:0x103 ret:0 [52.251061] perf_mmap user->locked_vm:0x408 pinned_vm:0x184 ret:0 [52.251146] perf_mmap user->locked_vm:0x408 pinned_vm:0x204 ret:0 [52.251299] perf_mmap user->locked_vm:0x408 pinned_vm:0x285 ret:0 [52.251383] perf_mmap user->locked_vm:0x408 pinned_vm:0x305 ret:0 [52.251544] perf_mmap user->locked_vm:0x408 pinned_vm:0x386 ret:0 [52.251634] perf_mmap user->locked_vm:0x408 pinned_vm:0x406 ret:0 [52.253018] perf_mmap user->locked_vm:0x408 pinned_vm:0x487 ret:0 [52.253197] perf_mmap user->locked_vm:0x408 pinned_vm:0x508 ret:0 [52.253374] perf_mmap user->locked_vm:0x408 pinned_vm:0x589 ret:0 [52.253550] perf_mmap user->locked_vm:0x408 pinned_vm:0x60a ret:0 [52.253726] perf_mmap user->locked_vm:0x408 pinned_vm:0x68b ret:0 [52.253903] perf_mmap user->locked_vm:0x408 pinned_vm:0x70c ret:0 [52.254084] perf_mmap user->locked_vm:0x408 pinned_vm:0x78d ret:0 [52.254263] perf_mmap user->locked_vm:0x408 pinned_vm:0x80e ret:0 The value of user->locked_vm increases to a limit then the memory is tracked by pinned_vm. During deallocation the size is subtracted from pinned_vm until it hits a limit. Then a larger value is subtracted from locked_vm leading to a large number (because of type unsigned): [64.267797] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x78d [64.267826] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x70c [64.267848] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x68b [64.267869] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x60a [64.267891] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x589 [64.267911] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x508 [64.267933] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x487 [64.267952] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x406 [64.268883] perf_mmap_close mmap_user->locked_vm:0x307 pinned_vm:0x406 [64.269117] perf_mmap_close mmap_user->locked_vm:0x206 pinned_vm:0x406 [64.269433] perf_mmap_close mmap_user->locked_vm:0x105 pinned_vm:0x406 [64.269536] perf_mmap_close mmap_user->locked_vm:0x4 pinned_vm:0x404 [64.269797] perf_mmap_close mmap_user->locked_vm:0xffffffffffffff84 pinned_vm:0x303 [64.270105] perf_mmap_close mmap_user->locked_vm:0xffffffffffffff04 pinned_vm:0x202 [64.270374] perf_mmap_close mmap_user->locked_vm:0xfffffffffffffe84 pinned_vm:0x101 [64.270628] perf_mmap_close mmap_user->locked_vm:0xfffffffffffffe04 pinned_vm:0x0 This value sticks for the user until system is rebooted, causing follow-on system calls using locked_vm resource limit to fail. Note: There is no issue using the normal trace buffer. In fact the issue is in perf_mmap_close(). 
During allocation, auxiliary trace buffer memory is either tracked as 'extra' and added to 'pinned_vm', or tracked as 'user_extra' and added to 'locked_vm'. This applies to normal trace buffers and auxiliary trace buffers alike. However, in perf_mmap_close() all auxiliary trace buffer memory is subtracted from 'locked_vm' and never from 'pinned_vm'. This breaks the balance. Signed-off-by: Thomas Richter Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: gor@linux.ibm.com Cc: hechaol@fb.com Cc: heiko.carstens@de.ibm.com Cc: linux-perf-users@vger.kernel.org Cc: songliubraving@fb.com Fixes: d44248a41337 ("perf/core: Rework memory accounting in perf_mmap()") Link: https://lkml.kernel.org/r/20191021083354.67868-1-tmricht@linux.ibm.com [ Minor readability edits. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9ec0b0bfddbd..f5d7950d1931 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5607,8 +5607,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) perf_pmu_output_stop(event); /* now it's safe to free the pages */ - atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); + if (!rb->aux_mmap_locked) + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + else + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); -- cgit v1.2.3 From 6b1340cc00edeadd52ebd8a45171f38c8de2a387 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Tue, 15 Oct 2019 11:47:25 +0530 Subject: tracing: Fix race in perf_trace_buf initialization A race condition exists while initializing perf_trace_buf from perf_trace_init() and perf_kprobe_init(). CPU0 CPU1 perf_trace_init() mutex_lock(&event_mutex) perf_trace_event_init() perf_trace_event_reg() total_ref_count == 0 buf = alloc_percpu() perf_trace_buf[i] = buf tp_event->class->reg() //fails perf_kprobe_init() goto fail perf_trace_event_init() perf_trace_event_reg() fail: total_ref_count == 0 total_ref_count == 0 buf = alloc_percpu() perf_trace_buf[i] = buf tp_event->class->reg() total_ref_count++ free_percpu(perf_trace_buf[i]) perf_trace_buf[i] = NULL Any subsequent call to perf_trace_event_reg() will observe total_ref_count > 0, causing perf_trace_buf to always remain NULL. This can result in perf_trace_buf getting accessed from perf_trace_buf_alloc() without being initialized. Acquiring event_mutex in perf_kprobe_init() before calling perf_trace_event_init() should fix this race.
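The underlying invariant, in a generic form (a sketch with made-up names; the actual fix, shown in the diff further below, simply takes event_mutex around perf_trace_event_init()/close/unreg in the perf_kprobe paths): every path that bumps or drops the reference count around a lazily allocated buffer, including the error path that frees it again, must run under the same mutex.

  #include <linux/mutex.h>
  #include <linux/slab.h>

  static DEFINE_MUTEX(init_mutex);
  static int total_refs;
  static void *shared_buf;

  /* Lazy, refcounted init: correct only if *every* caller, including
   * the error path that drops the reference and frees the buffer,
   * holds init_mutex across the check-and-allocate sequence. */
  static int buf_get(void)
  {
  	int ret = 0;

  	mutex_lock(&init_mutex);
  	if (total_refs == 0) {
  		shared_buf = kzalloc(4096, GFP_KERNEL);
  		if (!shared_buf) {
  			ret = -ENOMEM;
  			goto out;
  		}
  	}
  	total_refs++;
  out:
  	mutex_unlock(&init_mutex);
  	return ret;
  }

  static void buf_put(void)
  {
  	mutex_lock(&init_mutex);
  	if (--total_refs == 0) {
  		kfree(shared_buf);
  		shared_buf = NULL;
  	}
  	mutex_unlock(&init_mutex);
  }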
The race caused the following bug: Unable to handle kernel paging request at virtual address 0000003106f2003c Mem abort info: ESR = 0x96000045 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000045 CM = 0, WnR = 1 user pgtable: 4k pages, 39-bit VAs, pgdp = ffffffc034b9b000 [0000003106f2003c] pgd=0000000000000000, pud=0000000000000000 Internal error: Oops: 96000045 [#1] PREEMPT SMP Process syz-executor (pid: 18393, stack limit = 0xffffffc093190000) pstate: 80400005 (Nzcv daif +PAN -UAO) pc : __memset+0x20/0x1ac lr : memset+0x3c/0x50 sp : ffffffc09319fc50 __memset+0x20/0x1ac perf_trace_buf_alloc+0x140/0x1a0 perf_trace_sys_enter+0x158/0x310 syscall_trace_enter+0x348/0x7c0 el0_svc_common+0x11c/0x368 el0_svc_handler+0x12c/0x198 el0_svc+0x8/0xc Ramdumps showed the following: total_ref_count = 3 perf_trace_buf = ( 0x0 -> NULL, 0x0 -> NULL, 0x0 -> NULL, 0x0 -> NULL) Link: http://lkml.kernel.org/r/1571120245-4186-1-git-send-email-prsood@codeaurora.org Cc: stable@vger.kernel.org Fixes: e12f03d7031a9 ("perf/core: Implement the 'perf_kprobe' PMU") Acked-by: Song Liu Signed-off-by: Prateek Sood Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0892e38ed6fb..a9dfa04ffa44 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -272,9 +272,11 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) goto out; } + mutex_lock(&event_mutex); ret = perf_trace_event_init(tp_event, p_event); if (ret) destroy_local_trace_kprobe(tp_event); + mutex_unlock(&event_mutex); out: kfree(func); return ret; @@ -282,8 +284,10 @@ out: void perf_kprobe_destroy(struct perf_event *p_event) { + mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); } -- cgit v1.2.3 From f3a519e4add93b7b31a6616f0b09635ff2e6a159 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 22 Oct 2019 10:39:40 +0300 Subject: perf/aux: Fix AUX output stopping Commit: 8a58ddae2379 ("perf/core: Fix exclusive events' grouping") allows CAP_EXCLUSIVE events to be grouped with other events. Since all of those also happen to be AUX events (which is not the case the other way around, because arch/s390), this changes the rules for stopping the output: the AUX event may not be on its PMU's context any more, if it's grouped with a HW event, in which case it will be on that HW event's context instead. If that's the case, munmap() of the AUX buffer can't find and stop the AUX event, potentially leaving the last reference with the atomic context, which will then end up freeing the AUX buffer. This will then trip warnings: Fix this by using the context's PMU context when looking for events to stop, instead of the event's PMU context. 
Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20191022073940.61814-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f5d7950d1931..bb3748d29b04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6949,7 +6949,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu = event->pmu; + struct pmu *pmu = event->ctx->pmu; struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, -- cgit v1.2.3 From 086ee46b08634a999bcd1707eabe3b0dc1806674 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Tue, 22 Oct 2019 14:12:26 +0100 Subject: timers/sched_clock: Include local timekeeping.h for missing declarations Include the timekeeping.h header to get the declaration of the sched_clock_{suspend,resume} functions. Fixes the following sparse warnings: kernel/time/sched_clock.c:275:5: warning: symbol 'sched_clock_suspend' was not declared. Should it be static? kernel/time/sched_clock.c:286:6: warning: symbol 'sched_clock_resume' was not declared. Should it be static? Signed-off-by: Ben Dooks (Codethink) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191022131226.11465-1-ben.dooks@codethink.co.uk --- kernel/time/sched_clock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 142b07619918..dbd69052eaa6 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -17,6 +17,8 @@ #include #include +#include "timekeeping.h" + /** * struct clock_read_data - data required to read from sched_clock() * -- cgit v1.2.3 From 7f2cbcbcafbca5360efbd175b3320852b8f7c637 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 21 Oct 2019 15:44:12 +0800 Subject: posix-cpu-timers: Fix two trivial comments Recent changes modified the function arguments of thread_group_sample_cputime() and task_cputimers_expired(), but forgot to update the comments. Fix it up. [ tglx: Changed the argument name of task_cputimers_expired() as the pointer points to an array of samples. ] Fixes: b7be4ef1365d ("posix-cpu-timers: Switch thread group sampling to array") Fixes: 001f7971433a ("posix-cpu-timers: Make expiry checks array based") Signed-off-by: Yi Wang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1571643852-21848-1-git-send-email-wang.yi59@zte.com.cn --- kernel/time/posix-cpu-timers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 92a431981b1c..42d512fcfda2 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -266,7 +266,7 @@ static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, /** * thread_group_sample_cputime - Sample cputime for a given task * @tsk: Task for which cputime needs to be started - * @iimes: Storage for time samples + * @samples: Storage for time samples * * Called from sys_getitimer() to calculate the expiry time of an active * timer. 
That means group cputime accounting is already active. Called @@ -1038,12 +1038,12 @@ unlock: * member of @pct->bases[CLK].nextevt. False otherwise */ static inline bool -task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) +task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct) { int i; for (i = 0; i < CPUCLOCK_MAX; i++) { - if (sample[i] >= pct->bases[i].nextevt) + if (samples[i] >= pct->bases[i].nextevt) return true; } return false; -- cgit v1.2.3